In [1]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from tsfresh import (
    extract_features,
    extract_relevant_features,
    select_features,
)
from tsfresh.utilities.dataframe_functions import impute

In [2]:
# Load the time series table and the per-id target.

X = pd.read_csv("occupancy.csv", parse_dates=["date"])

# Read the target file indexed by "id" and keep just the "occupancy"
# column, so `y` is bound once, directly as a Series.
y = pd.read_csv("occupancy_target.csv", index_col="id")["occupancy"]

In [3]:
# create features for one time series: light

# Keep the "date" column and sort on it, so that order-dependent features
# (linear trends, autocorrelations, AR coefficients, ...) are computed on
# chronologically ordered readings instead of relying on the row order of
# the CSV. This also matches the extract_relevant_features call further
# down, which sorts on "date" as well.
# With column_id and column_sort set, the only remaining column ("light")
# is treated as the value column.
features = extract_features(
    X[["id", "date", "light"]],
    column_id="id",
    column_sort="date",
    impute_function=impute,  # replace NaN/inf produced by some calculators
)

features.shape

Feature Extraction: 100%|█████████████████████████████████████████████████████████| 10/10 [00:08<00:00,  1.16it/s]


(135, 789)

In [4]:
# features and target have the same number of rows
# (the shapes themselves differ: features is 2-D, y is 1-D)

(features.shape, y.shape)

((135, 789), (135,))

In [5]:
# Keep only the features that tsfresh's selection deems relevant for y.
# NOTE(review): the selection sees the full target, including the rows that
# are later held out as the test set by train_test_split — confirm this
# leakage is acceptable for the recipe.

features = select_features(features, y)

# Row count of the selected table (this is what len() of a DataFrame
# reports); it does not show how many feature columns were kept.
features.shape[0]

135

In [6]:
# peek at the first rows of the selected feature table
features.head(n=5)

Unnamed: 0,light__minimum,"light__agg_linear_trend__attr_""intercept""__chunk_len_50__f_agg_""min""",light__quantile__q_0.1,light__quantile__q_0.3,light__quantile__q_0.4,light__quantile__q_0.2,light__median,light__quantile__q_0.6,"light__agg_linear_trend__attr_""intercept""__chunk_len_10__f_agg_""min""",light__quantile__q_0.7,...,light__index_mass_quantile__q_0.2,light__ratio_beyond_r_sigma__r_2.5,light__index_mass_quantile__q_0.8,"light__agg_autocorrelation__f_agg_""mean""__maxlag_40","light__cwt_coefficients__coeff_11__w_2__widths_(2, 5, 10, 20)",light__index_mass_quantile__q_0.6,light__index_mass_quantile__q_0.7,light__index_mass_quantile__q_0.3,"light__agg_autocorrelation__f_agg_""median""__maxlag_40",light__ar_coefficient__coeff_2__k_10
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.033333,0.116667,0.1,-0.010369,-137.454032,0.083333,0.083333,0.05,-0.068578,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.2,0.0,0.8,0.0,0.0,0.6,0.7,0.3,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.2,0.0,0.8,0.0,0.0,0.6,0.7,0.3,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.2,0.0,0.8,0.0,0.0,0.6,0.7,0.3,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.2,0.0,0.8,0.0,0.0,0.6,0.7,0.3,0.0,0.0


In [7]:
# take the first 5 selected feature names to display in the book recipe

feats = features.columns[:5]

feats

Index(['light__minimum',
       'light__agg_linear_trend__attr_"intercept"__chunk_len_50__f_agg_"min"',
       'light__quantile__q_0.1', 'light__quantile__q_0.3',
       'light__quantile__q_0.4'],
      dtype='object')

In [8]:
# show the first rows of those 5 feature columns (for book)

features.loc[:, feats].head()

Unnamed: 0,light__minimum,"light__agg_linear_trend__attr_""intercept""__chunk_len_50__f_agg_""min""",light__quantile__q_0.1,light__quantile__q_0.3,light__quantile__q_0.4
1,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0


In [9]:
# hold out 10% of the rows as a test set; the seed is fixed so the
# split is the same on every run

X_train, X_test, y_train, y_test = train_test_split(
    features, y, test_size=0.1, random_state=42
)

In [10]:
# fit a logistic regression on the training split, then report
# per-class metrics on the held-out split

cls = LogisticRegression(
    C=0.1,
    max_iter=1000,
    random_state=10,
).fit(X_train, y_train)  # fit() returns the estimator itself

print(classification_report(y_true=y_test, y_pred=cls.predict(X_test)))

              precision    recall  f1-score   support

           0       1.00      0.91      0.95        11
           1       0.75      1.00      0.86         3

    accuracy                           0.93        14
   macro avg       0.88      0.95      0.90        14
weighted avg       0.95      0.93      0.93        14



In [11]:
# extract features from every time series in X and keep only the ones
# relevant to y, in a single call
# NOTE: this rebinds `features`, replacing the light-only table built in
# the earlier cells.

features = extract_relevant_features(X, y, column_id="id", column_sort="date")

features.shape

Feature Extraction: 100%|█████████████████████████████████████████████████████████| 10/10 [00:30<00:00,  3.03s/it]


(135, 969)

In [12]:
# hold out 10% of the rows as a test set; same seed as the earlier
# split, now applied to the all-series feature table

X_train, X_test, y_train, y_test = train_test_split(
    features, y, test_size=0.1, random_state=42
)

In [13]:
# fit a logistic regression on the all-series features and report
# per-class metrics on the held-out split
# NOTE(review): C=1e-15 is an extremely strong penalty (C is the inverse
# regularization strength); presumably it compensates for unscaled
# features — confirm, or consider scaling the inputs instead.

cls = LogisticRegression(C=1e-15, random_state=10).fit(X_train, y_train)

print(classification_report(y_true=y_test, y_pred=cls.predict(X_test)))

              precision    recall  f1-score   support

           0       1.00      0.91      0.95        11
           1       0.75      1.00      0.86         3

    accuracy                           0.93        14
   macro avg       0.88      0.95      0.90        14
weighted avg       0.95      0.93      0.93        14

