In [1]:
# This notebook uses Chronos embeddings as input features for a Random Forest classifier

import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score
from chronos import ChronosBoltPipeline
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import os

# Hugging Face identifier of the pretrained Chronos-Bolt checkpoint.
model_id = "amazon/chronos-bolt-base"

# Prefer the second GPU (the original setting), but fall back to the first GPU
# or to the CPU so the notebook still runs on machines with fewer devices.
if torch.cuda.device_count() > 1:
    device = "cuda:1"
elif torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Load the checkpoint named by `model_id` (previously the string was duplicated
# here and `model_id` was never used).
pipeline = ChronosBoltPipeline.from_pretrained(
    model_id,
    device_map=device,
    torch_dtype=torch.float32,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from numpy.fft import fft

# HARTH dataset location is ./data/harth

class HARTHDataset(Dataset):
    """Sliding-window dataset built from HARTH accelerometer CSV files.

    Each CSV is read with pandas. Rows whose ``label`` is greater than 8 are
    dropped and the remaining labels are shifted down by one so they start at 0.
    Windows of ``chunk_size`` consecutive rows of the *filtered* frame are taken
    every ``step_size`` rows; a window is kept only when all of its rows carry
    the same label.

    For every kept window three items are stored:

    * session -- 1-D float32 tensor of length ``6 * chunk_size``: the six axes
      ``back_x, back_y, back_z, thigh_x, thigh_y, thigh_z`` concatenated in
      that order.
    * feature -- 1-D float32 tensor of hand-crafted features, ordered as
      back means (3), thigh means (3), back stds (3), thigh stds (3),
      back SMA (1), thigh SMA (1), then the first ``n_fft_bins`` FFT magnitudes
      of back x/y/z and thigh x/y/z.
    * label -- the (0-indexed) label shared by all rows of the window.

    Args:
        file_names: Iterable of CSV file names located inside ``data_dir``.
        chunk_size: Number of rows per window (default 150).
        step_size: Stride between window starts, in rows (default 75).
        data_dir: Directory containing the CSV files (default ``data/harth``).
        n_fft_bins: Number of leading FFT magnitude bins kept per axis
            (default 10).

    Raises:
        ValueError: If ``chunk_size``, ``step_size`` or ``n_fft_bins`` is not
            a positive integer.
    """

    # Column order defines the layout of both the session and the features.
    _COLUMNS = ("back_x", "back_y", "back_z", "thigh_x", "thigh_y", "thigh_z")

    def __init__(self, file_names, chunk_size=150, step_size=75,
                 data_dir=os.path.join("data", "harth"), n_fft_bins=10):
        if chunk_size < 1 or step_size < 1 or n_fft_bins < 1:
            raise ValueError("chunk_size, step_size and n_fft_bins must be positive")

        self.sessions = []
        self.labels = []
        self.features = []

        for file_name in file_names:
            print(f"Processing {file_name}")
            raw_path = os.path.join(data_dir, file_name)
            df = pd.read_csv(raw_path)

            # Filter for the labels 1-8
            df_filtered = df[df['label'] <= 8].copy()
            # Adjust labels to be 0-indexed for CrossEntropyLoss
            df_filtered['label'] = df_filtered['label'] - 1

            # NOTE(review): windows are taken over the filtered rows, so a window
            # can contain samples that were not adjacent in the recording (rows
            # with label > 8 were removed in between). Confirm this is intended.

            # Pull the columns out once per file; slicing arrays per window is
            # much cheaper than slicing the DataFrame per window.
            labels = df_filtered['label'].to_numpy()
            signals = df_filtered[list(self._COLUMNS)].to_numpy(dtype=np.float32)

            # "+ 1" so the final full window (ending exactly on the last row) is
            # included; without it a file with exactly chunk_size rows yields nothing.
            for start in range(0, len(labels) - chunk_size + 1, step_size):
                stop = start + chunk_size
                window_labels = labels[start:stop]

                # Ensure the chunk contains only one unique activity label
                if (window_labels != window_labels[0]).any():
                    continue

                # Shape (6, chunk_size): one row per axis, in _COLUMNS order.
                axes = torch.tensor(np.ascontiguousarray(signals[start:stop].T))

                back_mean, back_std, back_sma, back_fft = self._sensor_features(axes[:3], n_fft_bins)
                thigh_mean, thigh_std, thigh_sma, thigh_fft = self._sensor_features(axes[3:], n_fft_bins)

                features = torch.cat([
                    back_mean,
                    thigh_mean,
                    back_std,
                    thigh_std,
                    back_sma,
                    thigh_sma,
                    back_fft,
                    thigh_fft,
                ])

                # Flattening row by row concatenates the six axes end to end.
                self.sessions.append(axes.reshape(-1))
                self.labels.append(window_labels[0])
                self.features.append(features)

    @staticmethod
    def _sensor_features(xyz, n_fft_bins):
        """Compute hand-crafted features for one sensor.

        Args:
            xyz: float32 tensor of shape (3, chunk_size) holding the x, y and z
                axes of a single sensor.
            n_fft_bins: Number of leading FFT magnitude bins kept per axis.

        Returns:
            Tuple ``(mean, std, sma, fft_mag)`` of 1-D float32 tensors with
            3, 3, 1 and up to ``3 * n_fft_bins`` elements respectively.
        """
        mean = xyz.mean(dim=1)
        std = xyz.std(dim=1)
        # Signal magnitude area: sum of |x| + |y| + |z| over the window, per sample.
        sma = (xyz.abs().sum(dim=0).sum() / xyz.shape[1]).reshape(1)
        # FFT with NumPy, one transform per axis; keep the leading magnitude bins.
        spectrum = np.abs(fft(xyz.numpy(), axis=1))[:, :n_fft_bins]
        fft_mag = torch.tensor(spectrum, dtype=torch.float32).reshape(-1)
        return mean, std, sma, fft_mag

    def __len__(self):
        """Return the number of windows kept across all files."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(session, feature, label)`` for the window at ``idx``."""
        session = self.sessions[idx]
        label = self.labels[idx]
        feature = self.features[idx]
        return session, feature, label

In [3]:
# Subject recordings used to fit the classifiers.
train_files = [
    "S006.csv", "S009.csv", "S010.csv", "S012.csv", "S013.csv", "S014.csv",
    "S015.csv", "S020.csv", "S021.csv", "S022.csv", "S023.csv", "S024.csv",
    "S025.csv", "S026.csv", "S027.csv", "S028.csv", "S029.csv",
]
# Held-out subject recordings used only for evaluation.
test_files = ["S008.csv", "S016.csv", "S017.csv", "S018.csv", "S019.csv"]

train_dataset = HARTHDataset(train_files)
test_dataset = HARTHDataset(test_files)

Processing S006.csv
Processing S009.csv
Processing S010.csv
Processing S012.csv
Processing S013.csv
Processing S014.csv
Processing S015.csv
Processing S020.csv
Processing S021.csv
Processing S022.csv
Processing S023.csv
Processing S024.csv
Processing S025.csv
Processing S026.csv
Processing S027.csv
Processing S028.csv
Processing S029.csv
Processing S008.csv
Processing S016.csv
Processing S017.csv
Processing S018.csv
Processing S019.csv


In [4]:
# Seed the shuffling so the row order of the extracted training matrix is the
# same on every fresh run; otherwise downstream results change between runs.
loader_generator = torch.Generator().manual_seed(42)

train_loader = DataLoader(train_dataset, batch_size=4096, shuffle=True, generator=loader_generator)
# Evaluation data keeps its original order.
test_loader = DataLoader(test_dataset, batch_size=4096, shuffle=False)

In [7]:
# Extract Features in Batches Using DataLoader
def extract_features_from_loader(loader, model_pipeline):
    """Embed every batch of a loader and collect features and labels.

    For each ``(series, features, labels)`` batch, the series are passed to
    ``model_pipeline.embed`` and the returned embeddings are mean-pooled over
    dimension 1 to give one vector per series.

    Args:
        loader: Sized iterable yielding ``(series_batch, features_batch,
            labels_batch)`` tuples of tensors.
        model_pipeline: Object whose ``embed(list_of_series)`` returns a tuple
            whose first element is the embedding tensor.

    Returns:
        Tuple of NumPy arrays ``(chronos_embeddings, handcrafted_features,
        combined_features, labels)``; ``combined_features`` is the embeddings
        and the hand-crafted features stacked column-wise.

    Raises:
        ValueError: If the loader yields no batches.
    """
    chronos_embeddings = []
    handcrafted_features = []
    all_labels = []

    n_batches = len(loader)

    # Pure inference: no autograd graph is needed for feature extraction.
    with torch.no_grad():
        # Loop through the data batch by batch
        for i, (series_batch, features_batch, labels_batch) in enumerate(loader):
            print(f"  Processing batch {i+1}/{n_batches}...")

            embeddings_batch, _ = model_pipeline.embed(list(series_batch))

            # detach/cpu make the NumPy conversion safe even if the pipeline
            # returns a tensor that requires grad or lives on an accelerator.
            pooled = embeddings_batch.detach().mean(dim=1).cpu()
            if pooled.dtype == torch.bfloat16:
                # NumPy has no bfloat16 dtype.
                pooled = pooled.float()

            chronos_embeddings.append(pooled.numpy())
            handcrafted_features.append(features_batch.detach().cpu().numpy())
            all_labels.append(labels_batch.detach().cpu().numpy())

    if not all_labels:
        raise ValueError("loader yielded no batches; nothing to extract")

    x_chronos = np.vstack(chronos_embeddings)
    x_handcrafted = np.vstack(handcrafted_features)
    # Row order is identical in both matrices, so stack the columns once here.
    x_combined = np.hstack([x_chronos, x_handcrafted])

    # returns chronos embeddings, handcrafted features, all features, and all labels
    return x_chronos, x_handcrafted, x_combined, np.concatenate(all_labels)

In [8]:
# Training split: Chronos embeddings, hand-crafted features, both combined, labels.
(
    x_train_chronos,
    x_train_hc,
    x_train_combined,
    y_train,
) = extract_features_from_loader(loader=train_loader, model_pipeline=pipeline)

# Test split: same four arrays, extracted with the same pipeline.
(
    x_test_chronos,
    x_test_hc,
    x_test_combined,
    y_test,
) = extract_features_from_loader(loader=test_loader, model_pipeline=pipeline)

  Processing batch 1/13...
  Processing batch 2/13...
  Processing batch 3/13...
  Processing batch 4/13...
  Processing batch 5/13...
  Processing batch 6/13...
  Processing batch 7/13...
  Processing batch 8/13...
  Processing batch 9/13...
  Processing batch 10/13...
  Processing batch 11/13...
  Processing batch 12/13...
  Processing batch 13/13...
  Processing batch 1/5...
  Processing batch 2/5...
  Processing batch 3/5...
  Processing batch 4/5...
  Processing batch 5/5...


# RF on Chronos embedding

In [17]:
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state makes the forest (bootstrap samples and feature
# sub-sampling) repeatable, so the numbers printed below can be reproduced.
classifier = RandomForestClassifier(
    n_estimators=150,
    n_jobs=-1,
    class_weight='balanced',
    random_state=42,
)
classifier.fit(x_train_chronos, y_train)
y_pred = classifier.predict(x_test_chronos)

# Overall accuracy on the held-out subjects.
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

0.9683051435940677
[[ 1849     3    18     3     0    28     3     0]
 [   27   285     0     0     0     0     0     0]
 [   90     0    65     0     0    92    10     0]
 [  121     0     1    16     0     3     2     0]
 [   77     0     0     0     7     1     0     0]
 [   24     0    26     0     0  1845    25     0]
 [   21     0     0     0     0    18 13681     1]
 [   13     0     0     0     0     8    24  1774]]


# RF on hand-crafted features

In [19]:
# Pin the seed before refitting so this run is repeatable regardless of how the
# estimator was originally constructed; all other hyperparameters are kept.
classifier.set_params(random_state=42)

# fit() discards the previously learned trees and retrains on the hand-crafted features.
classifier.fit(x_train_hc, y_train)
y_pred = classifier.predict(x_test_hc)

accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

0.9713803878775854
[[ 1841    11    31     5     2    14     0     0]
 [   11   300     0     0     1     0     0     0]
 [   79     0    88     0     0    83     7     0]
 [   93     0     1    46     0     3     0     0]
 [   59     0     0     0    26     0     0     0]
 [   30     0    45     0     0  1844     1     0]
 [   20     0     0     0     0     0 13700     1]
 [    0     0     0     0     0     0    80  1739]]


# RF on combined features

In [20]:
# Pin the seed before refitting so this run is repeatable regardless of how the
# estimator was originally constructed; all other hyperparameters are kept.
classifier.set_params(random_state=42)

# fit() discards the previously learned trees and retrains on the combined
# (Chronos embedding + hand-crafted) feature matrix.
classifier.fit(x_train_combined, y_train)
y_pred = classifier.predict(x_test_combined)

accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

0.9723228014483408
[[ 1844     7    20     3     0    27     3     0]
 [   18   294     0     0     0     0     0     0]
 [   84     0    73     0     0    93     7     0]
 [  118     0     0    21     0     3     1     0]
 [   74     0     0     0    10     1     0     0]
 [   28     0    30     0     0  1862     0     0]
 [   21     0     0     0     0     1 13699     0]
 [    5     0     0     0     0     0    14  1800]]
