<a href="https://colab.research.google.com/github/ThangNguyen2812/Cos4007_Portfolio_week3/blob/main/Portfolio3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Step 1 — Data Collection**

In [2]:
import pandas as pd
import numpy as np

# Load CSVs
boning = pd.read_csv("/content/data/Boning.csv")
slicing = pd.read_csv("/content/data/Slicing.csv")

print("Boning shape:", boning.shape)
print("Slicing shape:", slicing.shape)

# -----------------------------
# Choose sensor sets (based on student number last digit)
last_digit = 0  # <-- change this to your own
mapping = {
    0: ("Neck", "Head"),
    1: ("RightShoulder", "LeftShoulder"),
    2: ("RightUpperArm", "LeftUpperArm"),
    3: ("RightForearm", "LeftForearm"),
    4: ("RightHand", "LeftHand"),
    5: ("RightUpperLeg", "LeftUpperLeg"),
    6: ("RightLowerLeg", "LeftLowerLeg"),
    7: ("RightFoot", "LeftFoot"),
    8: ("RightToe", "LeftToe"),
    9: ("L5", "T12"),
}
sensor1, sensor2 = mapping[last_digit]
print(f"Using sensors: {sensor1}, {sensor2}")

# -----------------------------
# Helper: find x,y,z columns for a given sensor name
def find_xyz(df, sensor_name):
    cols = [c for c in df.columns if sensor_name.lower() in c.lower()]
    if len(cols) < 3:
        raise ValueError(f"Could not find full x,y,z for {sensor_name}")
    return cols[:3]  # assumes first 3 are x,y,z

s1_cols = find_xyz(boning, sensor1)
s2_cols = find_xyz(boning, sensor2)

print("Sensor 1 cols:", s1_cols)
print("Sensor 2 cols:", s2_cols)

# -----------------------------
# Build dataframes with required columns
def build_df(df, s1, s2, label):
    out = pd.DataFrame()
    out["frame"] = np.arange(len(df))   # create frame index
    out[s1] = df[s1]
    out[s2] = df[s2]
    out["class"] = label
    return out


boning_step1 = build_df(boning, s1_cols, s2_cols, 0)
slicing_step1 = build_df(slicing, s1_cols, s2_cols, 1)

data_step1 = pd.concat([boning_step1, slicing_step1], axis=0).reset_index(drop=True)
print("Step 1 shape:", data_step1.shape)
display(data_step1.head())


Boning shape: (54180, 67)
Slicing shape: (17880, 67)
Using sensors: Neck, Head
Sensor 1 cols: ['Neck x', 'Neck y', 'Neck z']
Sensor 2 cols: ['Head x', 'Head y', 'Head z']
Step 1 shape: (72060, 8)


Unnamed: 0,frame,Neck x,Neck y,Neck z,Head x,Head y,Head z,class
0,0,0.207796,0.127939,-0.17513,0.376399,0.202993,-0.182585,0
1,1,-0.006589,0.356974,0.286768,0.204439,0.521502,0.198235,0
2,2,0.112606,0.043502,0.104975,0.021196,0.19739,0.165812,0
3,3,-0.031866,0.037024,0.131005,-0.157759,0.118886,0.201893,0
4,4,0.135369,0.019024,0.11565,0.011714,0.096737,0.107186,0


**Step 2 — Create Composite Columnsbold**

In [3]:
from math import sqrt, atan2, pi

def compute_composites(df, x, y, z, prefix):
    X, Y, Z = df[x].to_numpy(), df[y].to_numpy(), df[z].to_numpy()
    comps = pd.DataFrame({
        f"{prefix}_rms_xy": np.sqrt((X**2 + Y**2)/2),
        f"{prefix}_rms_yz": np.sqrt((Y**2 + Z**2)/2),
        f"{prefix}_rms_zx": np.sqrt((Z**2 + X**2)/2),
        f"{prefix}_rms_xyz": np.sqrt((X**2 + Y**2 + Z**2)/3),
        f"{prefix}_roll": 180 * np.arctan2(Y, np.sqrt(X**2 + Z**2)) / pi,
        f"{prefix}_pitch": 180 * np.arctan2(X, np.sqrt(Y**2 + Z**2)) / pi,
    })
    return comps

# Apply composites to dataset
def add_composites(df, s1, s2):
    df_copy = df.copy()
    df_copy = pd.concat(
        [df_copy,
         compute_composites(df, s1[0], s1[1], s1[2], "s1"),
         compute_composites(df, s2[0], s2[1], s2[2], "s2")],
        axis=1
    )
    return df_copy

data_step2 = add_composites(data_step1, s1_cols, s2_cols)
print("Step 2 shape:", data_step2.shape)
display(data_step2.head())


Step 2 shape: (72060, 20)


Unnamed: 0,frame,Neck x,Neck y,Neck z,Head x,Head y,Head z,class,s1_rms_xy,s1_rms_yz,s1_rms_zx,s1_rms_xyz,s1_roll,s1_pitch,s2_rms_xy,s2_rms_yz,s2_rms_zx,s2_rms_xyz,s2_roll,s2_pitch
0,0,0.207796,0.127939,-0.17513,0.376399,0.202993,-0.182585,0,0.172551,0.15336,0.192158,0.173415,25.210615,43.774063,0.302393,0.193059,0.295816,0.268465,25.883962,54.044176
1,1,-0.006589,0.356974,0.286768,0.204439,0.521502,0.198235,0,0.252462,0.32378,0.202829,0.264392,51.216647,-0.82438,0.396081,0.394501,0.201361,0.343053,61.363191,20.124758
2,2,0.112606,0.043502,0.104975,0.021196,0.19739,0.165812,0,0.08536,0.080349,0.108857,0.092362,15.778909,44.740429,0.140379,0.182286,0.118201,0.149338,49.740164,4.700431
3,3,-0.031866,0.037024,0.131005,-0.157759,0.118886,0.201893,0,0.034541,0.096263,0.095335,0.080723,15.355297,-13.174411,0.139681,0.165672,0.181175,0.163077,24.89121,-33.953692
4,4,0.135369,0.019024,0.11565,0.011714,0.096737,0.107186,0,0.096661,0.082876,0.125896,0.103379,6.098993,49.113563,0.068903,0.102095,0.076243,0.083634,41.897439,4.638284


**Step 3 — Data Pre-processing & Feature Computation**

In [7]:
import numpy as np
import pandas as pd
from scipy.signal import find_peaks

# Copy from Step 2
full = data_step2.copy()

# Columns to compute features for (exclude frame and class)
feature_cols = [c for c in full.columns if c not in ["frame", "class"]]

# Window size = 60 frames (1 minute)
window = 60

def count_peaks(arr):
    """Count number of peaks in an array safely."""
    try:
        peaks, _ = find_peaks(arr)
        return len(peaks)
    except Exception:
        return 0

rows = []
labels = []

num_windows = full.shape[0] // window
for w in range(num_windows):
    start = w * window
    end = start + window
    block = full.iloc[start:end]
    feats = {}
    for col in feature_cols:
        arr = block[col].astype(float).to_numpy()
        feats[f"{col}_mean"] = np.mean(arr)
        feats[f"{col}_std"] = np.std(arr)
        feats[f"{col}_min"] = np.min(arr)
        feats[f"{col}_max"] = np.max(arr)
        feats[f"{col}_auc"] = np.trapz(np.abs(arr))   # AUC (use trapezoid to avoid warnings)
        feats[f"{col}_peaks"] = count_peaks(arr)
    rows.append(feats)
    # Take the class label from the first row of this block
    labels.append(int(block.iloc[0]["class"]))

# Build features dataframe
features = pd.DataFrame(rows)
features["class"] = labels

print("Step 3 features shape:", features.shape)
display(features.head())

# Save for later steps
import os
os.makedirs("submission", exist_ok=True)
features.to_csv("submission/features_per_minute.csv", index=False)
print("Saved to submission/features_per_minute.csv")


Step 3 features shape: (1201, 109)


Unnamed: 0,Neck x_mean,Neck x_std,Neck x_min,Neck x_max,Neck x_auc,Neck x_peaks,Neck y_mean,Neck y_std,Neck y_min,Neck y_max,...,s2_roll_max,s2_roll_auc,s2_roll_peaks,s2_pitch_mean,s2_pitch_std,s2_pitch_min,s2_pitch_max,s2_pitch_auc,s2_pitch_peaks,class
0,0.058447,0.256127,-0.59556,0.653929,11.356785,16,-0.00577,0.268144,-0.613843,0.560261,...,76.785135,2438.283686,13,14.586659,36.77982,-63.205906,80.848958,1956.972442,14,0
1,-0.096646,0.682678,-1.595246,2.575807,27.35723,16,0.009024,0.401579,-1.113462,0.914943,...,68.965599,2003.544901,14,-14.411278,41.752705,-82.903717,72.177073,2276.685617,11,0
2,-0.028674,1.069323,-2.42352,2.001616,50.841302,13,0.091343,0.746604,-1.483868,1.689948,...,85.312889,1983.160138,15,-12.058287,47.965019,-85.010275,79.714435,2571.634725,13,0
3,0.165025,1.0069,-2.14165,2.492493,48.764596,14,-0.277588,1.091049,-2.639818,2.195484,...,82.058709,2490.455061,12,9.807505,35.587808,-67.321659,85.879061,1802.491733,16,0
4,-0.146506,1.338005,-3.68095,3.637092,60.460457,16,0.07601,1.600203,-4.399867,5.590761,...,85.060266,2370.577829,12,-10.568928,42.283748,-81.704511,64.139626,2192.924917,11,0


Saved to submission/features_per_minute.csv


**Step 4: Training**

Train:


*   SVM classifiers with:
*   Train–Test split (70/30)
* 10-fold Cross-validation
* Both with hyperparameter tuning
* Both with tuning + top 10 features (SelectKBest)
* Both with tuning + PCA (10 components)
* Other classifiers: SGD, Random Forest, MLP




**Step 4.1 – Prepare data**

In [21]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import numpy as np
import pandas as pd

# Load feature dataset (from Step 3)
features = pd.read_csv("submission/features_per_minute.csv")

X = features.drop(columns=["class"])
y = features["class"]

# Train-Test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

print("Training shape:", X_train.shape, "Testing shape:", X_test.shape)


Training shape: (840, 108) Testing shape: (361, 108)


**Step 4.2 – SVM Experiments**

In [32]:
# --- Data already loaded from Step 4.1 ---
# X, y

# Train-Test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Define base pipeline
svm_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("svm", SVC())
])

# Define hyperparameter grid for tuning
param_grid = {
    "svm__C": [0.1, 1, 10],
    "svm__gamma": ["scale", 0.01, 0.001],
    "svm__kernel": ["rbf"]
}

# -------------------------------------------------
print("\n1) Train-Test split (70/30)")
svm_pipeline.fit(X_train, y_train)
acc_train_test = svm_pipeline.score(X_test, y_test)
print("Train-Test Accuracy:", round(acc_train_test * 100, 2), "%")

print("\n2) 10-fold cross-validation")
cv_scores = cross_val_score(svm_pipeline, X, y, cv=10)
cv_acc = np.mean(cv_scores)
print("Cross-validation Accuracy:", round(cv_acc * 100, 2), "%")

# -------------------------------------------------
print("\n3) 1 and 2 with hyperparameter tuning")
grid = GridSearchCV(svm_pipeline, param_grid, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)
print("Best Params:", grid.best_params_)
print("Train-Test Accuracy:", round(grid.score(X_test, y_test) * 100, 2), "%")
print("Cross-validation Accuracy:", round(grid.best_score_ * 100, 2), "%")

# -------------------------------------------------
print("\n4) 1 and 2 with hyperparameter tuning and 10 best features")
select_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("select", SelectKBest(score_func=f_classif, k=10)),
    ("svm", SVC())
])
grid_select = GridSearchCV(select_pipeline, param_grid, cv=5, n_jobs=-1)
grid_select.fit(X_train, y_train)
print("Best Params:", grid_select.best_params_)
print("Train-Test Accuracy:", round(grid_select.score(X_test, y_test) * 100, 2), "%")
print("Cross-validation Accuracy:", round(grid_select.best_score_ * 100, 2), "%")

# -------------------------------------------------
print("\n5) 1 and 2 with hyperparameter tuning and 10 principal components")
pca_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=10)),
    ("svm", SVC())
])
grid_pca = GridSearchCV(pca_pipeline, param_grid, cv=5, n_jobs=-1)
grid_pca.fit(X_train, y_train)
print("Best Params:", grid_pca.best_params_)
print("Train-Test Accuracy:", round(grid_pca.score(X_test, y_test) * 100, 2), "%")
print("Cross-validation Accuracy:", round(grid_pca.best_score_ * 100, 2), "%")



1) Train-Test split (70/30)
Train-Test Accuracy: 86.98 %

2) 10-fold cross-validation
Cross-validation Accuracy: 86.51 %

3) 1 and 2 with hyperparameter tuning
Best Params: {'svm__C': 10, 'svm__gamma': 0.001, 'svm__kernel': 'rbf'}
Train-Test Accuracy: 86.98 %
Cross-validation Accuracy: 86.67 %

4) 1 and 2 with hyperparameter tuning and 10 best features
Best Params: {'svm__C': 10, 'svm__gamma': 0.01, 'svm__kernel': 'rbf'}
Train-Test Accuracy: 81.16 %
Cross-validation Accuracy: 82.14 %

5) 1 and 2 with hyperparameter tuning and 10 principal components
Best Params: {'svm__C': 10, 'svm__gamma': 0.001, 'svm__kernel': 'rbf'}
Train-Test Accuracy: 86.43 %
Cross-validation Accuracy: 87.14 %


**Step 4.3 – SVM Summary**

In [38]:
svm_activity6 = pd.DataFrame({
    "SVM Model": [
        "Train-Test split (70/30)",
        "10-fold cross-validation",
        "1 and 2 with hyperparameter tuning",
        "1 and 2 with hyperparameter tuning and 10 best features",
        "1 and 2 with hyperparameter tuning and 10 principal components"
    ],
    "Train-Test Split (%)": [
        round(acc_train_test * 100, 2),
        "-",  # not applicable for pure CV
        round(grid.score(X_test, y_test) * 100, 2),
        round(grid_select.score(X_test, y_test) * 100, 2),
        round(grid_pca.score(X_test, y_test) * 100, 2)
    ],
    "10-fold Cross-validation (%)": [
        "-",  # not applicable for pure Train-Test
        round(cv_acc * 100, 2),
        round(grid.best_score_ * 100, 2),
        round(grid_select.best_score_ * 100, 2),
        round(grid_pca.best_score_ * 100, 2)
    ]
})

print("\n=== Studio Activity 6: SVM Summary Table ===")
display(svm_activity6)

# Save
svm_activity6.to_csv("submission/svm_activity6_summary.csv", index=False)
print("Saved svm_activity6_summary.csv")



=== Studio Activity 6: SVM Summary Table ===


Unnamed: 0,SVM Model,Train-Test Split (%),10-fold Cross-validation (%)
0,Train-Test split (70/30),86.98,-
1,10-fold cross-validation,-,86.51
2,1 and 2 with hyperparameter tuning,86.98,86.67
3,1 and 2 with hyperparameter tuning and 10 best...,81.16,82.14
4,1 and 2 with hyperparameter tuning and 10 prin...,86.43,87.14


Saved svm_activity6_summary.csv


**Step 4.4 – Other Models (SGD, Random Forest, MLP)**

In [34]:
# SGD
sgd = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", SGDClassifier(max_iter=1000, tol=1e-3, random_state=42))
])
sgd.fit(X_train, y_train)
sgd_acc = sgd.score(X_test, y_test)

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_acc = rf.score(X_test, y_test)

# MLP
mlp = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42))
])
mlp.fit(X_train, y_train)
mlp_acc = mlp.score(X_test, y_test)


**Step 4.5 – Other Models Table**

In [40]:
# Recompute CV for all three models
sgd_cv = np.mean(cross_val_score(sgd, X, y, cv=10))
rf_cv = np.mean(cross_val_score(rf, X, y, cv=10))
mlp_cv = np.mean(cross_val_score(mlp, X, y, cv=10))

activity7 = pd.DataFrame({
    "Model": ["SGD", "Random Forest", "MLP"],
    "Train-Test Split (%)": [
        round(sgd_acc * 100, 2),
        round(rf_acc * 100, 2),
        round(mlp_acc * 100, 2)
    ],
    "10-fold Cross-validation (%)": [
        round(sgd_cv * 100, 2),
        round(rf_cv * 100, 2),
        round(mlp_cv * 100, 2)
    ]
})

print("\n=== Studio Activity 7: Other Models Summary Table ===")
display(activity7)

# Save
activity7.to_csv("submission/activity7_summary.csv", index=False)
print("Saved activity7_summary.csv")



=== Studio Activity 7: Other Models Summary Table ===


Unnamed: 0,Model,Train-Test Split (%),10-fold Cross-validation (%)
0,SGD,84.49,82.43
1,Random Forest,84.49,85.43
2,MLP,85.87,84.51


Saved activity7_summary.csv
