In [1]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from tsfresh.feature_extraction import extract_features
from tsfresh.feature_extraction import settings

In [2]:
# Load the time series table; the "date" column is parsed into datetimes.
# Later cells read the "id", "light" and "co2" columns from this frame.
X = pd.read_csv("occupancy.csv", parse_dates=["date"])

# Load the targets indexed by id and keep only the "occupancy" column,
# which is already a Series.
y = pd.read_csv("occupancy_target.csv", index_col="id")["occupancy"]

In [3]:
# Select a minimal number of features to create: tsfresh's MinimalFCParameters
# preset, the smallest of the three presets inspected in this notebook.

minimal_feat = settings.MinimalFCParameters()

# Show the configured feature calculators and their parameters
# (None means the calculator takes no extra parameters here).
minimal_feat.items()

ItemsView({'sum_values': None, 'median': None, 'mean': None, 'length': None, 'standard_deviation': None, 'variance': None, 'root_mean_square': None, 'maximum': None, 'absolute_maximum': None, 'minimum': None})

In [4]:
# Build the minimal feature set from the light signal alone.
# Rows sharing the same "id" are treated as one time series.
features = extract_features(
    X.loc[:, ["id", "light"]],
    default_fc_parameters=minimal_feat,
    column_id="id",
)

# (number of ids, number of extracted features)
features.shape

Feature Extraction: 100%|█████████████████████████████████████████████████████████| 10/10 [00:03<00:00,  2.95it/s]


(135, 10)

In [5]:
# Preview the first rows of the feature matrix: one row per id, one column
# per "<kind>__<feature>" combination.
features.head()

Unnamed: 0,light__sum_values,light__median,light__mean,light__length,light__standard_deviation,light__variance,light__root_mean_square,light__maximum,light__absolute_maximum,light__minimum
1,2932.5,0.0,48.875,60.0,134.485582,18086.371875,143.091361,419.0,419.0,0.0
2,0.0,0.0,0.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# split data
#
# train_test_split pairs the rows of `features` and `y` purely by position.
# `features` is indexed by id, and `y` is indexed by id in the order of the
# target file, so the target is re-ordered to follow `features.index` first.
# `.loc` raises a KeyError if an id in `features` has no label, instead of
# silently pairing a series with the wrong label.

X_train, X_test, y_train, y_test = train_test_split(
    features,
    y.loc[features.index],
    test_size=0.1,
    random_state=42,
)

In [7]:
# Fit a logistic regression on the training features, then print per-class
# precision/recall/F1 for the held-out split.

cls = LogisticRegression(C=0.01, random_state=10).fit(X_train, y_train)

print(classification_report(y_true=y_test, y_pred=cls.predict(X_test)))

              precision    recall  f1-score   support

           0       1.00      0.91      0.95        11
           1       0.75      1.00      0.86         3

    accuracy                           0.93        14
   macro avg       0.88      0.95      0.90        14
weighted avg       0.95      0.93      0.93        14



In [8]:
# tsfresh's preset of features that are efficient to compute.

efficient_feat = settings.EfficientFCParameters()

# Number of feature calculators configured by this preset.
len(efficient_feat)

74

In [9]:
# tsfresh's comprehensive preset of feature calculators.

comprehensive_feat = settings.ComprehensiveFCParameters()

# Number of feature calculators configured by this preset.
len(comprehensive_feat)

76

In [10]:
# Per-signal feature settings: each dict maps a feature-calculator name to
# None (no parameters) or to a list of parameter dicts, one per variant.

# Light: three parameter-free statistics plus two quantiles.
light_feat = {
    "sum_values": None,
    "median": None,
    "standard_deviation": None,
    "quantile": [{"q": level} for level in (0.2, 0.7)],
}

# CO2: root mean square plus peak counts for two values of n.
co2_feat = {
    "root_mean_square": None,
    "number_peaks": [{"n": support} for support in (1, 2)],
}

In [11]:
# Map each signal (column name in X) to the feature settings defined for it.
kind_to_fc_parameters = dict(light=light_feat, co2=co2_feat)

In [12]:
# create different features for different time series
#
# "number_peaks" depends on the order of the samples within each id, so the
# timestamp column is passed as `column_sort`. tsfresh then orders every
# series by "date" itself instead of relying on the CSV row order; the sort
# column is used only for ordering and is not treated as a signal.

features = extract_features(
    X[["id", "date", "light", "co2"]],
    column_id="id",
    column_sort="date",
    kind_to_fc_parameters=kind_to_fc_parameters,
)

# (number of ids, number of extracted features)
features.shape

Feature Extraction: 100%|█████████████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.34it/s]


(135, 8)

In [13]:
# List the generated feature names; parameterised calculators carry their
# parameter values in the column name (e.g. "__q_0.2", "__n_1").
features.columns

Index(['light__sum_values', 'light__median', 'light__standard_deviation',
       'light__quantile__q_0.2', 'light__quantile__q_0.7',
       'co2__root_mean_square', 'co2__number_peaks__n_1',
       'co2__number_peaks__n_2'],
      dtype='object')

In [14]:
# Preview the first rows of the per-kind feature matrix (one row per id).
features.head()

Unnamed: 0,light__sum_values,light__median,light__standard_deviation,light__quantile__q_0.2,light__quantile__q_0.7,co2__root_mean_square,co2__number_peaks__n_1,co2__number_peaks__n_2
1,2932.5,0.0,134.485582,0.0,0.0,656.304517,16.0,6.0
2,0.0,0.0,0.0,0.0,0.0,573.234731,12.0,10.0
3,0.0,0.0,0.0,0.0,0.0,521.507749,8.0,5.0
4,0.0,0.0,0.0,0.0,0.0,493.834529,12.0,9.0
5,0.0,0.0,0.0,0.0,0.0,476.999701,15.0,8.0


In [15]:
# split data
#
# train_test_split pairs the rows of `features` and `y` purely by position.
# `features` is indexed by id, and `y` is indexed by id in the order of the
# target file, so the target is re-ordered to follow `features.index` first.
# `.loc` raises a KeyError if an id in `features` has no label, instead of
# silently pairing a series with the wrong label.

X_train, X_test, y_train, y_test = train_test_split(
    features,
    y.loc[features.index],
    test_size=0.1,
    random_state=42,
)

In [16]:
# Fit a logistic regression on the per-kind features, then print per-class
# precision/recall/F1 for the held-out split.
# NOTE(review): C=1e-15 is an extremely small value and the features are
# passed in unscaled - confirm this setting is intentional.

cls = LogisticRegression(C=1e-15, random_state=10).fit(X_train, y_train)

print(classification_report(y_true=y_test, y_pred=cls.predict(X_test)))

              precision    recall  f1-score   support

           0       1.00      0.64      0.78        11
           1       0.43      1.00      0.60         3

    accuracy                           0.71        14
   macro avg       0.71      0.82      0.69        14
weighted avg       0.88      0.71      0.74        14

