In [1]:
import os, gc, json, warnings
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupShuffleSplit
from tqdm import tqdm
from collections import defaultdict
warnings.filterwarnings('ignore')

import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sktime.classification.kernel_based._rocket_classifier import RocketClassifier

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device:', device)
WIN  = 128
STEP = 64
ROCKET_KERNELS = 10000

Device: cpu


In [6]:
data_dir = Path('data')
train_dir = data_dir / 'train'
meta_file = data_dir / 'meta_data.txt'
test_file = data_dir/'test.csv'
label_map = {
    'null': 0,'jogging': 1,'jogging (rotating arms)': 2,'jogging (skipping)': 3,'jogging (sidesteps)': 4,'jogging (butt-kicks)': 5,
    'stretching (triceps)': 6,'stretching (lunging)': 7,'stretching (shoulders)': 8,'stretching (hamstrings)': 9,'stretching (lumbar rotation)': 10,
    'push-ups': 11,'push-ups (complex)': 12,'sit-ups': 13,'sit-ups (complex)': 14,'burpees': 15,'lunges': 16,'lunges (complex)': 17,'bench-dips': 18
}
num_classes = len(label_map)
C = 3
crit = nn.CrossEntropyLoss()
print(len(label_map))

19


In [7]:
frames = []
for f in sorted(train_dir.glob('sbj_*.csv')):
    df = pd.read_csv(f, low_memory=False)
    df['subject'] = df['sbj_id'].astype(str)
    frames.append(df)
raw = pd.concat(frames, ignore_index=True)
print('Shape raw:', raw.shape)

raw['label_code'] = raw['label'].map(label_map)
raw = raw.dropna(subset=['label_code']).reset_index(drop=True)
raw['label_code'] = raw['label_code'].astype(int)

sensor_cols = [c for c in raw.columns if c.endswith(('_x','_y','_z'))]
locs = sorted({c[:-2] for c in sensor_cols})
ax_order = ['_x','_y','_z']
loc_axes_cols = {loc:[f'{loc}{a}' for a in ax_order] for loc in locs}
print('Locations:', locs)

before = len(raw)
raw = raw.dropna(subset=sensor_cols).reset_index(drop=True)
print(f'Remove NaN‑rows in sensors: {before-len(raw)} rows dropped')

scaler = StandardScaler()
raw[sensor_cols] = scaler.fit_transform(raw[sensor_cols])
raw['time_idx'] = raw.groupby('subject').cumcount()
raw

Shape raw: (3466400, 15)
Locations: ['left_arm_acc', 'left_leg_acc', 'right_arm_acc', 'right_leg_acc']
Remove NaN‑rows in sensors: 34274 rows dropped


Unnamed: 0,sbj_id,right_arm_acc_x,right_arm_acc_y,right_arm_acc_z,right_leg_acc_x,right_leg_acc_y,right_leg_acc_z,left_leg_acc_x,left_leg_acc_y,left_leg_acc_z,left_arm_acc_x,left_arm_acc_y,left_arm_acc_z,label,subject,label_code,time_idx
0,0,0.338590,0.120549,0.717712,0.294029,0.120450,0.139363,0.486923,-0.235985,-0.023205,-0.515422,0.331682,0.323862,jogging,0,1,0
1,0,0.267886,0.133180,0.748476,0.297809,0.085421,0.120335,0.541998,-0.236344,0.110817,-0.478973,0.336796,0.330691,jogging,0,1,1
2,0,0.271598,0.158237,0.777007,0.295387,0.078235,0.098782,0.701439,-0.146195,0.360308,-0.411302,0.327018,0.366185,jogging,0,1,2
3,0,0.346738,0.196146,0.795026,0.296009,0.080996,0.106860,1.079330,-0.066705,0.793745,-0.349172,0.311192,0.397661,jogging,0,1,3
4,0,0.423578,0.231633,0.814875,0.303610,0.072211,0.168760,0.928822,-0.216602,0.634914,-0.295446,0.247657,0.381219,jogging,0,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2054759,9,0.660601,-0.674779,0.036996,-0.878534,1.191544,0.817278,-0.760971,0.802049,1.221196,-0.660437,-0.716009,0.319617,bench-dips,9,18,87485
2054760,9,0.671317,-0.672684,0.074857,-0.849709,1.158368,0.741078,-0.755161,0.787947,1.301947,-0.663010,-0.723955,0.356210,bench-dips,9,18,87486
2054761,9,0.674111,-0.697452,0.119595,-0.807822,1.139071,0.600962,-0.761807,0.774746,1.326825,-0.662831,-0.700415,0.299180,bench-dips,9,18,87487
2054762,9,0.660443,-0.734104,0.114785,-0.767200,1.132643,0.498166,-0.780179,0.776143,1.301909,-0.663853,-0.702628,0.306702,bench-dips,9,18,87488


In [8]:
dfs_long = []
for loc in locs:
    cols = loc_axes_cols[loc]
    df_loc = raw[['subject','time_idx','label_code'] + cols].copy()
    df_loc = df_loc.rename(columns={cols[0]: 'x_axis', cols[1]: 'y_axis', cols[2]: 'z_axis'})
    df_loc['sensor_location'] = loc
    df_loc['sbj_id'] = df_loc['subject']
    df_loc['id'] = df_loc['time_idx']
    dfs_long.append(df_loc[['id','sbj_id','sensor_location','x_axis','y_axis','z_axis','label_code']])
long_df = pd.concat(dfs_long, ignore_index=True)
print('Langformat Trainingsdaten:', long_df.shape)
print('Beispiel:', long_df.head())

scaler = StandardScaler()
raw[sensor_cols] = scaler.fit_transform(raw[sensor_cols])
raw

Langformat Trainingsdaten: (8219056, 7)
Beispiel:    id sbj_id sensor_location    x_axis    y_axis    z_axis  label_code
0   0      0    left_arm_acc -0.515422  0.331682  0.323862           1
1   1      0    left_arm_acc -0.478973  0.336796  0.330691           1
2   2      0    left_arm_acc -0.411302  0.327018  0.366185           1
3   3      0    left_arm_acc -0.349172  0.311192  0.397661           1
4   4      0    left_arm_acc -0.295446  0.247657  0.381219           1


Unnamed: 0,sbj_id,right_arm_acc_x,right_arm_acc_y,right_arm_acc_z,right_leg_acc_x,right_leg_acc_y,right_leg_acc_z,left_leg_acc_x,left_leg_acc_y,left_leg_acc_z,left_arm_acc_x,left_arm_acc_y,left_arm_acc_z,label,subject,label_code,time_idx
0,0,0.338590,0.120549,0.717712,0.294029,0.120450,0.139363,0.486923,-0.235985,-0.023205,-0.515422,0.331682,0.323862,jogging,0,1,0
1,0,0.267886,0.133180,0.748476,0.297809,0.085421,0.120335,0.541998,-0.236344,0.110817,-0.478973,0.336796,0.330691,jogging,0,1,1
2,0,0.271598,0.158237,0.777007,0.295387,0.078235,0.098782,0.701439,-0.146195,0.360308,-0.411302,0.327018,0.366185,jogging,0,1,2
3,0,0.346738,0.196146,0.795026,0.296009,0.080996,0.106860,1.079330,-0.066705,0.793745,-0.349172,0.311192,0.397661,jogging,0,1,3
4,0,0.423578,0.231633,0.814875,0.303610,0.072211,0.168760,0.928822,-0.216602,0.634914,-0.295446,0.247657,0.381219,jogging,0,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2054759,9,0.660601,-0.674779,0.036996,-0.878534,1.191544,0.817278,-0.760971,0.802049,1.221196,-0.660437,-0.716009,0.319617,bench-dips,9,18,87485
2054760,9,0.671317,-0.672684,0.074857,-0.849709,1.158368,0.741078,-0.755161,0.787947,1.301947,-0.663010,-0.723955,0.356210,bench-dips,9,18,87486
2054761,9,0.674111,-0.697452,0.119595,-0.807822,1.139071,0.600962,-0.761807,0.774746,1.326825,-0.662831,-0.700415,0.299180,bench-dips,9,18,87487
2054762,9,0.660443,-0.734104,0.114785,-0.767200,1.132643,0.498166,-0.780179,0.776143,1.301909,-0.663853,-0.702628,0.306702,bench-dips,9,18,87488


In [9]:
X_all, y_all, subj_all, loc_all = [], [], [], []
for subj_id, sub_df in tqdm(raw.groupby('subject')):
    for loc in locs:
        cols = loc_axes_cols[loc]
        if not all(c in sub_df.columns for c in cols):
            continue
        sub_loc = sub_df.dropna(subset=cols).reset_index(drop=True)
        if len(sub_loc) < WIN:
            continue
        data = sub_loc[cols].values  # (T_loc, 3)
        for s in range(0, len(data) - WIN + 1, STEP):
            win = data[s:s+WIN]
            X_all.append(win)
            y_all.append(sub_loc['label_code'].iloc[s:s+WIN].mode()[0])
            subj_all.append(subj_id)
            loc_all.append(locs.index(loc))
# Arrays erzeugen
X_all = np.stack(X_all)
y_all = np.array(y_all)
loc_all = np.array(loc_all)
X_all


100%|██████████| 22/22 [00:19<00:00,  1.12it/s]


array([[[-0.51542226,  0.3316824 ,  0.32386189],
        [-0.47897283,  0.33679623,  0.33069118],
        [-0.41130215,  0.32701837,  0.36618523],
        ...,
        [ 1.20288759, -0.53217809,  0.24055676],
        [ 1.12929376, -0.45804168,  0.4403317 ],
        [ 1.11882087, -0.27598673,  0.71401951]],

       [[-3.89590723,  4.75640814,  1.77220168],
        [-3.09596045,  5.30652886,  1.14170253],
        [-2.37791246,  4.65586008,  0.92890985],
        ...,
        [-0.3646925 , -1.52475698,  0.8257138 ],
        [-0.2237099 , -1.45106992,  0.74591084],
        [-0.16218621, -1.14542911,  0.55239553]],

       [[ 1.21125051,  0.40219823,  0.91554007],
        [ 0.33482915,  4.26295678,  2.44218778],
        [-0.96748268,  6.59998852,  3.62361624],
        ...,
        [ 1.20592763, -0.3263123 ,  0.55278531],
        [ 1.20457124, -0.41138996,  0.42054981],
        [ 1.13510143, -0.50261776,  0.32443833]],

       ...,

       [[-0.70472747,  1.1446685 ,  0.44916532],
        [-0

In [11]:
class DeepConvLSTM(nn.Module):
    def __init__(self, in_ch=C, classes=num_classes, hidden=128, conv_channels=64, dropout=0.2):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv1d(in_ch, conv_channels, 5, padding=2), nn.ReLU(),nn.Dropout(dropout),
            nn.Conv1d(conv_channels, conv_channels, 5, padding=2), nn.ReLU(),nn.Dropout(dropout),
            nn.Conv1d(conv_channels, conv_channels, 5, padding=2), nn.ReLU(),nn.Dropout(dropout),
            nn.Conv1d(conv_channels, conv_channels, 5, padding=2), nn.ReLU()
        )
        self.lstm = nn.LSTM(conv_channels, hidden, num_layers=2, batch_first=True, dropout=dropout)
        self.fc   = nn.Linear(hidden, classes)

    def forward(self, x):
        x = self.conv(x)
        x = x.permute(0,2,1)
        out,_ = self.lstm(x)
        return self.fc(out[:,-1])


In [1]:
locs

NameError: name 'locs' is not defined

In [13]:
from torch.utils.data import Dataset, DataLoader, Subset

class LocationWindowDataset(Dataset):
    def __init__(self, df_loc, win, step):
        self.df = df_loc.reset_index(drop=True)
        self.win = win
        self.step = step
        self.starts = list(range(0, len(self.df) - win + 1, step))
    def __len__(self):
        return len(self.starts)
    def __getitem__(self, idx):
        s = self.starts[idx]
        window = self.df.iloc[s:s+self.win]
        x = window[['x_axis','y_axis','z_axis']].values.T.astype(np.float32)
        y = window['label_code'].mode()[0]
        grp = window['sbj_id'].iloc[0]
        return torch.from_numpy(x), int(y), grp

conv_models = {}
rocket_models = {}
conv_scores = {}
rocket_scores = {}

In [None]:
for loc in locs:
    df_loc = long_df[long_df['sensor_location']==loc]
    if df_loc.empty:
        print(f"Keine Daten für {loc}, überspringe.")
        continue

    ds = LocationWindowDataset(df_loc, WIN, STEP)
    n = len(ds)
    groups = [ds[i][2] for i in range(n)]
    labels = [ds[i][1] for i in range(n)]

    gss = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state=42)
    train_idx, val_idx = next(gss.split(range(n), labels, groups))
    train_ds = Subset(ds, train_idx)
    val_ds   = Subset(ds, val_idx)

    tr_ld = DataLoader(train_ds, batch_size=64, shuffle=True)
    va_ld = DataLoader(val_ds,   batch_size=64)

    model = DeepConvLSTM_Single(in_ch=3).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=1e-3)
    for epoch in range(1,6):
        model.train(); total_loss=0
        for xb,yb,_ in tr_ld:
            xb,yb = xb.to(device), yb.to(device)
            opt.zero_grad()
            loss = crit(model(xb), yb)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()
            total_loss += loss.item()*yb.size(0)
        model.eval(); correct=0
        with torch.no_grad():
            for xb,yb,_ in va_ld:
                xb,yb = xb.to(device), yb.to(device)
                pred = model(xb).argmax(1)
                correct += (pred==yb).sum().item()
        acc = correct/len(val_ds)
    conv_models[loc] = model
    conv_scores[loc] = acc
    print(f"ConvLSTM {loc} VAL Acc: {acc:.3f}")

    def build_panel(idxs):
        pnl, lbls = [], []
        for i in idxs:
            x,y,_ = ds[i]
            pnl.append({'x':pd.Series(x[0].numpy()), 'y':pd.Series(x[1].numpy()), 'z':pd.Series(x[2].numpy())})
            lbls.append(y)
        return pd.DataFrame(pnl), np.array(lbls)

    Xp_tr, y_tr = build_panel(train_idx)
    Xp_vl, y_vl = build_panel(val_idx)
    rc = RocketClassifier(num_kernels=ROCKET_KERNELS, random_state=42)
    rc.fit(Xp_tr, y_tr)
    preds = rc.predict(Xp_vl)
    rocket_models[loc] = rc
    rocket_scores[loc] = (preds==y_vl).mean()
    print(f"Rocket {loc} VAL Acc: {rocket_scores[loc]:.3f}")

    del train_ds, val_ds, tr_ld, va_ld, model, rc, Xp_tr, Xp_vl, preds
    gc.collect()
    if torch.cuda.is_available(): torch.cuda.empty_cache()


In [None]:
for loc in locs:
    df_loc = long_df[long_df['sensor_location']==loc].reset_index(drop=True)
    if df_loc.empty:
        print(f"Keine Daten für {loc}, überspringe.")
        continue
    Xw, yw, subj_w = [], [], []
    for subj, sub in df_loc.groupby('sbj_id'):
        data = sub[['x_axis','y_axis','z_axis']].values
        for s in range(0, len(data)-WIN+1, STEP):
            win = data[s:s+WIN]
            Xw.append(win)
            yw.append(sub['label_code'].iloc[s:s+WIN].mode()[0])
            subj_w.append(subj)
    Xw = np.array(Xw)  # (N_w, WIN, 3)
    yw = np.array(yw)
    subj_w = np.array(subj_w)
    Xc = torch.tensor(Xw.transpose(0,2,1), dtype=torch.float32)
    yc = torch.tensor(yw, dtype=torch.long)
    gss = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state=42)
    ti, vi = next(gss.split(Xc, yc, groups=subj_w))
    tr_ds = TensorDataset(Xc[ti], yc[ti])
    va_ds = TensorDataset(Xc[vi], yc[vi])
    tr_ld = DataLoader(tr_ds, batch_size=128, shuffle=True)
    va_ld = DataLoader(va_ds, batch_size=128)
    model = DeepConvLSTM().to(device)
    opt = torch.optim.Adam(model.parameters(), lr=1e-3)
    print("training...")
    for ep in range(1,3):
        model.train(); tl=0
        for xb,yb in tr_ld:
            xb,yb = xb.to(device), yb.to(device)
            opt.zero_grad()
            loss = crit(model(xb), yb)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),1.0)
            opt.step()
            tl+=loss.item()*yb.size(0)
        model.eval(); vl=0; corr=0
        with torch.no_grad():
            for xb,yb in va_ld:
                xb,yb = xb.to(device), yb.to(device)
                out = model(xb)
                vl+=crit(out,yb).item()*yb.size(0)
                corr+=(out.argmax(1)==yb).sum().item()
        acc= corr/len(va_ds)
        if ep==5:
            conv_scores[loc] = acc
    conv_models[loc] = model
    print(f"ConvLSTM {loc} Val Acc: {acc:.3f}")
    # ROCKET
    panel = []
    for win in Xw:
        panel.append({'x':pd.Series(win[:,0]), 'y':pd.Series(win[:,1]), 'z':pd.Series(win[:,2])})
    Xp = pd.DataFrame(panel)
    y = yw
    ti, vi = next(gss.split(Xp, y, groups=subj_w))
    rc = RocketClassifier(num_kernels=ROCKET_KERNELS)
    rc.fit(Xp.iloc[ti], y[ti])
    preds = rc.predict(Xp.iloc[vi])
    score = (preds==y[vi]).mean()
    rocket_scores[loc] = score
    rocket_models[loc] = rc
    print(f"ROCKET {loc} Val Acc: {score:.3f}")

training...
ConvLSTM left_arm_acc Val Acc: 0.381


In [None]:
print("Rocket Scores:", rocket_scores)

import joblib
for loc, m in rocket_models.items():
    joblib.dump(m, f"rocket_model_{loc}.pkl")
for loc, m in conv_models.items():
    torch.save(m.state_dict(), f"conv_model_{loc}.pt")
print("Modelle gespeichert.")

In [9]:
#!python -c "import sklearn; print(sklearn.__version__)"
#!pip uninstall -y scikit-learn sktime
!pip install scikit-learn==1.3.2
!pip install sktime==0.37.0



In [1]:
test_df = pd.read_csv(test_file)
ids, acts = [], []
for idx, grp in test_df.groupby('id'):
    loc = grp['sensor_location'].iloc[0]
    ser = {'x':pd.Series(grp['x_axis'].values), 'y':pd.Series(grp['y_axis'].values), 'z':pd.Series(grp['z_axis'].values)}
    pred = rocket_models[loc].predict(pd.DataFrame([ser]))[0]
    ids.append(idx); acts.append(pred)
sub = pd.DataFrame({'id':ids,'target_value':acts})
sub.to_csv('submission_rocket.csv', index=False)
display(sub.head())

NameError: name 'pd' is not defined