In [5]:
import pandas as pd
from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import impute
from sklearn.feature_selection import VarianceThreshold

In [6]:
def extract_tsfresh_features(input_csv, output_csv, variance_threshold=0.01):
    """Extract per-participant tsfresh features and save the non-constant ones.

    The daily records in ``input_csv`` are reshaped to long format (one row per
    participant, date and signal), passed to ``tsfresh.extract_features``
    grouped by ``participant_id`` and ordered by ``date``, imputed, filtered
    with a variance threshold, and written to ``output_csv``.

    Parameters
    ----------
    input_csv : str or path-like
        CSV file that must contain the columns ``participant_id``, ``date``,
        ``avg_heart_rate``, ``daily_steps`` and ``hours_sleep``.
    output_csv : str or path-like
        Destination CSV for the selected feature matrix (the frame's index is
        written as the first column).
    variance_threshold : float, default 0.01
        Features whose variance across participants is not greater than this
        value are dropped.

    Returns
    -------
    None
        The result is only written to ``output_csv``.

    Raises
    ------
    KeyError
        If a required column is missing from ``input_csv``.
    """
    signal_columns = ["avg_heart_rate", "daily_steps", "hours_sleep"]

    df = pd.read_csv(input_csv)
    df["date"] = pd.to_datetime(df["date"])

    # Long format: one (participant_id, date, kind, value) row per reading.
    ts_df = df[["participant_id", "date", *signal_columns]].melt(
        id_vars=["participant_id", "date"],
        var_name="kind",
        value_name="value",
    )
    # A missing reading becomes a NaN "value" row after melting; tsfresh is
    # expected to reject NaNs in the value column, so drop those rows here.
    # NOTE(review): confirm against the installed tsfresh version.
    ts_df = ts_df.dropna(subset=["value"])

    features = extract_features(
        ts_df,
        column_id="participant_id",
        column_sort="date",
        column_kind="kind",
        column_value="value",
    )

    # impute() modifies `features` in place (NaN/inf are replaced), so the
    # return value is intentionally ignored.
    impute(features)

    # Only the fitted support mask is needed; selecting columns through it
    # keeps the DataFrame's index and column names, which a transformed
    # ndarray would lose.
    # NOTE(review): the variance is computed on unscaled feature values, so
    # the threshold is scale-dependent.
    selector = VarianceThreshold(threshold=variance_threshold)
    selector.fit(features)

    selected_features = features.loc[:, selector.get_support()]
    selected_features.to_csv(output_csv)


In [7]:
# Run the extraction on the cleaned dataset; the recorded run below took
# roughly four minutes.
extract_tsfresh_features(
    input_csv="data/cleaned_health_fitness_data.csv",
    output_csv="data/tsfresh_features.csv",
)

Feature Extraction: 100%|██████████| 30/30 [03:43<00:00,  7.45s/it]
 'daily_steps__query_similarity_count__query_None__threshold_0.0'
 'hours_sleep__query_similarity_count__query_None__threshold_0.0'] did not have any finite values. Filling with zeros.
