In [1]:
%matplotlib inline
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
from pathlib import Path

DATA_DIR = Path("/content/drive/MyDrive/data1")
print(DATA_DIR.exists(), DATA_DIR)

True /content/drive/MyDrive/data1


In [14]:
CLASSES = ["idle", "walking", "running", "stairs"]

In [15]:
def read_acc_csv(fp: Path) -> pd.DataFrame:
    """
    Reads a single accelerometer CSV.
    Tries to infer 3 accelerometer columns (ax, ay, az).
    Works with/without header; drops non-numeric columns.
    """

    df = pd.read_csv(fp)

    # залишаємо лише числові колонки
    num = df.select_dtypes(include=[np.number]).copy()


    if num.shape[1] < 3:
        df2 = pd.read_csv(fp, header=None)
        num2 = df2.select_dtypes(include=[np.number]).copy()
        num = num2

    # беремо перші 3 числові колонки як акселерометр
    if num.shape[1] < 3:
        raise ValueError(f"File {fp} has <3 numeric columns. Can't infer ax,ay,az.")

    num = num.iloc[:, :3].copy()
    num.columns = ["ax", "ay", "az"]

    # приберемо NaN/inf
    num = num.replace([np.inf, -np.inf], np.nan).dropna()

    return num

In [16]:
def load_dataset(data_dir: Path, classes: list[str]) -> pd.DataFrame:
    """
    Loads all CSV files from class folders into a long dataframe:
    columns: [ax, ay, az, label, file_id]
    """
    rows = []
    for label in classes:
        folder = data_dir / label
        if not folder.exists():
            raise FileNotFoundError(f"Folder not found: {folder}")

        files = sorted(folder.glob("*.csv"))
        if len(files) == 0:
            raise FileNotFoundError(f"No CSV files found in {folder}")

        for fp in files:
            sig = read_acc_csv(fp)
            sig["label"] = label
            sig["file_id"] = fp.name
            rows.append(sig)

    return pd.concat(rows, ignore_index=True)

In [30]:
def windowize(df_long: pd.DataFrame, window_size: int = 64, step: int = 32, pad_short: bool = True):
    """
    Splits signals into windows per file.
    If a file is shorter than window_size and pad_short=True, it will be padded to create 1 window.
    Returns:
      X_windows: [n_windows, window_size, 3]
      y_windows: [n_windows]
    """
    X_list, y_list = [], []

    for (label, file_id), part in df_long.groupby(["label", "file_id"]):
        arr = part[["ax", "ay", "az"]].values
        n = arr.shape[0]

        if n < window_size:
            if not pad_short:
                continue
            # pad by repeating last value
            pad_len = window_size - n
            pad_block = np.repeat(arr[-1:, :], repeats=pad_len, axis=0)
            w = np.vstack([arr, pad_block])
            X_list.append(w)
            y_list.append(label)
            continue

        for start in range(0, n - window_size + 1, step):
            w = arr[start:start + window_size]
            X_list.append(w)
            y_list.append(label)

    return np.array(X_list, dtype=float), np.array(y_list)

In [31]:
def _entropy(x: np.ndarray, bins: int = 16) -> float:
    """Shannon entropy of a 1D signal (hist-based)."""
    hist, _ = np.histogram(x, bins=bins, density=True)
    hist = hist[hist > 0]
    if len(hist) == 0:
        return 0.0
    return float(-np.sum(hist * np.log2(hist)))

def _zero_cross_rate(x: np.ndarray) -> float:
    """Zero-crossing rate."""
    return float(np.mean((x[:-1] * x[1:]) < 0))

def _energy(x: np.ndarray) -> float:
    """Mean energy of a 1D signal."""
    return float(np.mean(x ** 2))

def _sma(ax: np.ndarray, ay: np.ndarray, az: np.ndarray) -> float:
    """Signal magnitude area."""
    return float(np.mean(np.abs(ax) + np.abs(ay) + np.abs(az)))

def extract_features_simple(Xw: np.ndarray) -> pd.DataFrame:
    """
    Simple features: mean/std for each axis + magnitude.
    Xw: [n_windows, window, 3]
    """
    feats = []
    for w in Xw:
        ax, ay, az = w[:, 0], w[:, 1], w[:, 2]
        mag = np.sqrt(ax**2 + ay**2 + az**2)

        f = {
            "ax_mean": np.mean(ax), "ax_std": np.std(ax),
            "ay_mean": np.mean(ay), "ay_std": np.std(ay),
            "az_mean": np.mean(az), "az_std": np.std(az),
            "mag_mean": np.mean(mag), "mag_std": np.std(mag),
        }
        feats.append(f)
    return pd.DataFrame(feats)

def extract_features_tdf(Xw: np.ndarray) -> pd.DataFrame:
    """
    Time-domain features (richer):
    - mean, std, min, max, median, iqr, mad, energy, entropy, zcr
    - magnitude features
    - SMA
    - correlations between axes
    """
    feats = []
    for w in Xw:
        ax, ay, az = w[:, 0], w[:, 1], w[:, 2]
        mag = np.sqrt(ax**2 + ay**2 + az**2)

        def stats(prefix: str, x: np.ndarray) -> dict:
            q75, q25 = np.percentile(x, [75, 25])
            iqr = q75 - q25
            med = np.median(x)
            mad = np.median(np.abs(x - med))
            return {
                f"{prefix}_mean": float(np.mean(x)),
                f"{prefix}_std": float(np.std(x)),
                f"{prefix}_min": float(np.min(x)),
                f"{prefix}_max": float(np.max(x)),
                f"{prefix}_median": float(med),
                f"{prefix}_iqr": float(iqr),
                f"{prefix}_mad": float(mad),
                f"{prefix}_energy": _energy(x),
                f"{prefix}_entropy": _entropy(x),
                f"{prefix}_zcr": _zero_cross_rate(x),
            }

        f = {}
        f.update(stats("ax", ax))
        f.update(stats("ay", ay))
        f.update(stats("az", az))
        f.update(stats("mag", mag))

        # SMA
        f["sma"] = _sma(ax, ay, az)

        # Correlations
        def safe_corr(a, b):
            if np.std(a) < 1e-12 or np.std(b) < 1e-12:
                return 0.0
            return float(np.corrcoef(a, b)[0, 1])

        f["corr_ax_ay"] = safe_corr(ax, ay)
        f["corr_ax_az"] = safe_corr(ax, az)
        f["corr_ay_az"] = safe_corr(ay, az)

        feats.append(f)

    return pd.DataFrame(feats)

In [32]:
def evaluate_model(name: str, model, X_train, y_train, X_test, y_test, labels_order):
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    print("=" * 80)
    print(name)
    print("=" * 80)
    print(classification_report(y_test, pred, target_names=labels_order))

    cm = confusion_matrix(y_test, pred, labels=labels_order)
    cm_df = pd.DataFrame(cm, index=[f"true_{c}" for c in labels_order],
                         columns=[f"pred_{c}" for c in labels_order])
    display(cm_df)

    acc = accuracy_score(y_test, pred)
    f1_macro = f1_score(y_test, pred, average="macro")
    f1_weighted = f1_score(y_test, pred, average="weighted")

    return {"model": name, "accuracy": acc, "f1_macro": f1_macro, "f1_weighted": f1_weighted}

In [33]:
df_long = load_dataset(DATA_DIR, CLASSES)
display(df_long.head())
print("Loaded rows:", len(df_long), "| files:", df_long["file_id"].nunique())

Unnamed: 0,ax,ay,az,label,file_id
0,1.000776,4.616021,8.576031,idle,idle-1.csv
1,0.718261,4.209007,8.446744,idle,idle-1.csv
2,-0.909797,-0.282516,9.203311,idle,idle-1.csv
3,5.09965,0.148441,8.418014,idle,idle-1.csv
4,1.762132,-0.162806,9.251195,idle,idle-1.csv


Loaded rows: 193860 | files: 6462


In [34]:
WINDOW = 64
STEP = 32

Xw, yw = windowize(df_long, window_size=WINDOW, step=STEP, pad_short=True)
print("Windows:", Xw.shape, "Labels:", yw.shape)
print("Label distribution:\n", pd.Series(yw).value_counts())

Windows: (6462, 64, 3) Labels: (6462,)
Label distribution:
 running    3408
walking    1850
idle       1039
stairs      165
Name: count, dtype: int64


In [35]:
X_simple = extract_features_simple(Xw)
X_tdf = extract_features_tdf(Xw)

print("Simple features:", X_simple.shape)
print("TDF features:", X_tdf.shape)

Simple features: (6462, 8)
TDF features: (6462, 44)


In [36]:
Xtr_s, Xte_s, ytr, yte = train_test_split(X_simple, yw, test_size=0.2, random_state=42, stratify=yw)
Xtr_t, Xte_t, _, _ = train_test_split(X_tdf,   yw, test_size=0.2, random_state=42, stratify=yw)

labels_order = CLASSES

In [37]:
svm = Pipeline([
    ("scaler", StandardScaler()),
    ("svc", SVC(kernel="rbf", C=10, gamma="scale", class_weight="balanced", random_state=42))
])

rf = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    class_weight="balanced",
    n_jobs=-1
)

In [38]:
results = []
results.append(evaluate_model("SVM (simple features)", svm, Xtr_s, ytr, Xte_s, yte, labels_order))
results.append(evaluate_model("RandomForest (simple features)", rf, Xtr_s, ytr, Xte_s, yte, labels_order))

SVM (simple features)
              precision    recall  f1-score   support

        idle       1.00      1.00      1.00       208
     walking       1.00      1.00      1.00       682
     running       0.49      0.85      0.62        33
      stairs       0.98      0.92      0.95       370

    accuracy                           0.97      1293
   macro avg       0.87      0.94      0.89      1293
weighted avg       0.98      0.97      0.98      1293



Unnamed: 0,pred_idle,pred_walking,pred_running,pred_stairs
true_idle,208,0,0,0
true_walking,0,340,1,29
true_running,0,1,681,0
true_stairs,0,5,0,28


RandomForest (simple features)
              precision    recall  f1-score   support

        idle       1.00      1.00      1.00       208
     walking       1.00      1.00      1.00       682
     running       0.87      0.39      0.54        33
      stairs       0.95      0.99      0.97       370

    accuracy                           0.98      1293
   macro avg       0.95      0.85      0.88      1293
weighted avg       0.98      0.98      0.98      1293



Unnamed: 0,pred_idle,pred_walking,pred_running,pred_stairs
true_idle,208,0,0,0
true_walking,0,365,3,2
true_running,0,0,682,0
true_stairs,0,20,0,13


In [39]:
results.append(evaluate_model("SVM (time-domain features)", svm, Xtr_t, ytr, Xte_t, yte, labels_order))
results.append(evaluate_model("RandomForest (time-domain features)", rf, Xtr_t, ytr, Xte_t, yte, labels_order))

SVM (time-domain features)
              precision    recall  f1-score   support

        idle       1.00      1.00      1.00       208
     walking       1.00      1.00      1.00       682
     running       0.79      1.00      0.88        33
      stairs       1.00      0.98      0.99       370

    accuracy                           0.99      1293
   macro avg       0.95      0.99      0.97      1293
weighted avg       0.99      0.99      0.99      1293



Unnamed: 0,pred_idle,pred_walking,pred_running,pred_stairs
true_idle,208,0,0,0
true_walking,0,361,0,9
true_running,0,0,682,0
true_stairs,0,0,0,33


RandomForest (time-domain features)
              precision    recall  f1-score   support

        idle       1.00      1.00      1.00       208
     walking       1.00      1.00      1.00       682
     running       1.00      0.91      0.95        33
      stairs       0.99      1.00      1.00       370

    accuracy                           1.00      1293
   macro avg       1.00      0.98      0.99      1293
weighted avg       1.00      1.00      1.00      1293



Unnamed: 0,pred_idle,pred_walking,pred_running,pred_stairs
true_idle,208,0,0,0
true_walking,0,370,0,0
true_running,0,0,682,0
true_stairs,0,3,0,30


In [40]:
res_df = pd.DataFrame(results).sort_values(by="f1_macro", ascending=False)
display(res_df)

Unnamed: 0,model,accuracy,f1_macro,f1_weighted
3,RandomForest (time-domain features),0.99768,0.987086,0.997629
2,SVM (time-domain features),0.993039,0.966922,0.993414
0,SVM (simple features),0.972158,0.892619,0.975197
1,RandomForest (simple features),0.980665,0.87659,0.977669


In [41]:
print("Model ranking by macro F1 (main metric):")
display(res_df)


Model ranking by macro F1 (main metric):


Unnamed: 0,model,accuracy,f1_macro,f1_weighted
3,RandomForest (time-domain features),0.99768,0.987086,0.997629
2,SVM (time-domain features),0.993039,0.966922,0.993414
0,SVM (simple features),0.972158,0.892619,0.975197
1,RandomForest (simple features),0.980665,0.87659,0.977669


## Висновки

Ми порівняли 2 моделі (SVM та RandomForest) на 2 наборах ознак:
1) **Simple features** (mean/std по осях та magnitude)
2) **Time-domain features** (розширені статистики: median, IQR, MAD, energy, entropy, ZCR, SMA, кореляції)

### Порівняння за classification_report
- На **simple features** якість нижча саме для класу **running** (менший support та часті плутанини).
- На **time-domain features** обидві моделі суттєво покращились; найкращий результат показав **RandomForest (time-domain features)**.

### Підсумок
Для цього датасету розширені **time-domain features** дають відчутний приріст якості (особливо за macro F1), а RandomForest на цих ознаках працює найстабільніше.
