In [4]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from scipy.signal import find_peaks
import warnings
warnings.filterwarnings('ignore')

In [5]:
from google.colab import drive
drive.mount('/content/drive/')
dataset_path = '/content/drive/MyDrive/Colab Notebooks/'

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [6]:
STUDENT_NUMBER_ENDING = 9

## DATA COLLECTION

In [7]:
# Define column mappings based on student number ending (corrected column names)
column_mapping = {
    0: (['Neck x', 'Neck y', 'Neck z'], ['Head x', 'Head y', 'Head z']),
    1: (['Right Shoulder x', 'Right Shoulder y', 'Right Shoulder z'],
        ['Left Shoulder x', 'Left Shoulder y', 'Left Shoulder z']),
    2: (['Right Upper Arm x', 'Right Upper Arm y', 'Right Upper Arm z'],
        ['Left Upper Arm x', 'Left Upper Arm y', 'Left Upper Arm z']),
    3: (['Right Forearm x', 'Right Forearm y', 'Right Forearm z'],
        ['Left Forearm x', 'Left Forearm y', 'Left Forearm z']),
    4: (['Right Hand x', 'Right Hand y', 'Right Hand z'],
        ['Left Hand x', 'Left Hand y', 'Left Hand z']),
    5: (['Right Upper Leg x', 'Right Upper Leg y', 'Right Upper Leg z'],
        ['Left Upper Leg x', 'Left Upper Leg y', 'Left Upper Leg z']),
    6: (['Right Lower Leg x', 'Right Lower Leg y', 'Right Lower Leg z'],
        ['Left Lower Leg x', 'Left Lower Leg y', 'Left Lower Leg z']),
    7: (['Right Foot x', 'Right Foot y', 'Right Foot z'],
        ['Left Foot x', 'Left Foot y', 'Left Foot z']),
    8: (['Right Toe x', 'Right Toe y', 'Right Toe z'],
        ['Left Toe x', 'Left Toe y', 'Left Toe z']),
    9: (['L5 x', 'L5 y', 'L5 z'], ['T12 x', 'T12 y', 'T12 z'])
}

In [8]:
# Load the data files
boning_data = pd.read_csv(dataset_path + 'Boning.csv')
slicing_data = pd.read_csv(dataset_path + 'Slicing.csv')

In [9]:
# Add class labels
boning_data['class'] = 0  # boning
slicing_data['class'] = 1  # slicing

# Get the columns for this student
col_set1, col_set2 = column_mapping[STUDENT_NUMBER_ENDING]

# Extract required columns (Frame + column sets + class)
required_columns = ['Frame'] + col_set1 + col_set2 + ['class']

# Extract columns from both datasets
boning_extracted = boning_data[required_columns]
slicing_extracted = slicing_data[required_columns]

# Combine datasets
combined_data = pd.concat([boning_extracted, slicing_extracted], ignore_index=True)
print(f"Data shape: {combined_data.shape}")

# Save the extracted data
combined_data.to_csv('extracted_data.csv', index=False)

Data shape: (72060, 8)


## CREATE COMPOSITE COLUMNS

In [10]:
# Create composite features for column set 1
x1, y1, z1 = combined_data[col_set1[0]], combined_data[col_set1[1]], combined_data[col_set1[2]]

combined_data['rms_xy_set1'] = np.sqrt((x1**2 + y1**2) / 2)
combined_data['rms_yz_set1'] = np.sqrt((y1**2 + z1**2) / 2)
combined_data['rms_zx_set1'] = np.sqrt((z1**2 + x1**2) / 2)
combined_data['rms_xyz_set1'] = np.sqrt((x1**2 + y1**2 + z1**2) / 3)
combined_data['roll_set1'] = 180 * np.arctan2(y1, np.sqrt(x1**2 + z1**2)) / np.pi
combined_data['pitch_set1'] = 180 * np.arctan2(x1, np.sqrt(y1**2 + z1**2)) / np.pi

In [11]:
# Create composite features for column set 2
x2, y2, z2 = combined_data[col_set2[0]], combined_data[col_set2[1]], combined_data[col_set2[2]]

combined_data['rms_xy_set2'] = np.sqrt((x2**2 + y2**2) / 2)
combined_data['rms_yz_set2'] = np.sqrt((y2**2 + z2**2) / 2)
combined_data['rms_zx_set2'] = np.sqrt((z2**2 + x2**2) / 2)
combined_data['rms_xyz_set2'] = np.sqrt((x2**2 + y2**2 + z2**2) / 3)
combined_data['roll_set2'] = 180 * np.arctan2(y2, np.sqrt(x2**2 + z2**2)) / np.pi
combined_data['pitch_set2'] = 180 * np.arctan2(x2, np.sqrt(y2**2 + z2**2)) / np.pi


In [12]:
print(f"Data shape with composite columns: {combined_data.shape}")

# Save data with composite columns
combined_data.to_csv('composite_data.csv', index=False)

Data shape with composite columns: (72060, 20)


## FEATURE COMPUTATION

In [13]:
# Get feature columns (exclude Frame and class)
feature_columns = [col for col in combined_data.columns if col not in ['Frame', 'class']]

In [14]:
# Process each class separately to create features per minute
feature_list = []

for class_val in combined_data['class'].unique():
    class_data = combined_data[combined_data['class'] == class_val].copy()
    class_data = class_data.reset_index(drop=True)  # Reset index for this class

    # Create minute grouping within this class data (60 frames = 1 minute)
    num_complete_minutes = len(class_data) // 60

    for minute in range(num_complete_minutes):
        # Get exactly 60 frames for this minute
        start_idx = minute * 60
        end_idx = start_idx + 60
        minute_data = class_data.iloc[start_idx:end_idx]

        minute_features = {
            'minute': f"{class_val}_{minute}",  # Unique identifier
            'class': class_val
        }

        # Compute statistical features for each feature column
        for col in feature_columns:
            values = minute_data[col].values

            minute_features[f'{col}_mean'] = np.mean(values)
            minute_features[f'{col}_std'] = np.std(values)
            minute_features[f'{col}_min'] = np.min(values)
            minute_features[f'{col}_max'] = np.max(values)
            minute_features[f'{col}_auc'] = np.trapz(np.abs(values))

            # Count peaks
            peaks, _ = find_peaks(values, height=np.mean(values))
            minute_features[f'{col}_peaks'] = len(peaks)

        feature_list.append(minute_features)

In [15]:
# Create feature dataframe
features_df = pd.DataFrame(feature_list)

print(f"Features shape: {features_df.shape}")
print(f"Class distribution: {features_df['class'].value_counts().to_dict()}")
print(f"Total features: {len([col for col in features_df.columns if col not in ['minute', 'class']])}")

Features shape: (1201, 110)
Class distribution: {0: 903, 1: 298}
Total features: 108


In [16]:
# Save feature data
features_df.to_csv('features.csv', index=False)
print("Features saved to 'features.csv'")

Features saved to 'features.csv'


## MODEL TRAINING

In [17]:
# Separate features and target
X = features_df.drop(['minute', 'class'], axis=1)
y = features_df['class']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

results = {}

In [18]:
# 1. Basic SVM - Train-Test Split (only if we have both classes)
if len(y.unique()) >= 2:
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.3, random_state=1, stratify=y
    )

    print("After split - Train classes:", y_train.value_counts().to_dict())
    print("After split - Test classes:", y_test.value_counts().to_dict())

    clf_basic = svm.SVC(random_state=1)
    clf_basic.fit(X_train, y_train)
    y_pred_basic = clf_basic.predict(X_test)
    basic_train_test = accuracy_score(y_test, y_pred_basic)

    print(f"Basic SVM train-test accuracy: {basic_train_test:.4f}")
else:
    print("Cannot proceed with training - need both classes!")

After split - Train classes: {0: 632, 1: 208}
After split - Test classes: {0: 271, 1: 90}
Basic SVM train-test accuracy: 0.8670


In [19]:
# 2. Basic SVM - Cross Validation
scores_basic_cv = cross_val_score(clf_basic, X_scaled, y, cv=10)
basic_cv = scores_basic_cv.mean()

results['Original features'] = {
    'train_test': basic_train_test,
    'cross_val': basic_cv
}

In [20]:
# 3. Hyperparameter Tuning
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
    'kernel': ['rbf']
}

grid_search = GridSearchCV(svm.SVC(random_state=1), param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
best_clf = grid_search.best_estimator_

print(f"Best parameters: {grid_search.best_params_}")

Best parameters: {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}


In [21]:
# 4. Tuned SVM - Train-Test Split
y_pred_tuned = best_clf.predict(X_test)
tuned_train_test = accuracy_score(y_test, y_pred_tuned)

In [22]:
# 5. Tuned SVM - Cross Validation
scores_tuned_cv = cross_val_score(best_clf, X_scaled, y, cv=10)
tuned_cv = scores_tuned_cv.mean()

results['With hyper-parameter tuning'] = {
    'train_test': tuned_train_test,
    'cross_val': tuned_cv
}


In [23]:
# 6. Feature Selection (10 best features)
selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X_scaled, y)

X_train_sel, X_test_sel, _, _ = train_test_split(X_selected, y, test_size=0.3, random_state=1)

clf_selected = svm.SVC(**grid_search.best_params_, random_state=1)
clf_selected.fit(X_train_sel, y_train)
y_pred_selected = clf_selected.predict(X_test_sel)
selected_train_test = accuracy_score(y_test, y_pred_selected)

scores_selected_cv = cross_val_score(clf_selected, X_selected, y, cv=10)
selected_cv = scores_selected_cv.mean()

results['With feature selection and hyper parameter tuning'] = {
    'train_test': selected_train_test,
    'cross_val': selected_cv
}

In [24]:
# 7. PCA (10 components)
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_scaled)

X_train_pca, X_test_pca, _, _ = train_test_split(X_pca, y, test_size=0.3, random_state=1)

clf_pca = svm.SVC(**grid_search.best_params_, random_state=1)
clf_pca.fit(X_train_pca, y_train)
y_pred_pca = clf_pca.predict(X_test_pca)
pca_train_test = accuracy_score(y_test, y_pred_pca)

scores_pca_cv = cross_val_score(clf_pca, X_pca, y, cv=10)
pca_cv = scores_pca_cv.mean()

results['With PCA and hyper parameter tuning'] = {
    'train_test': pca_train_test,
    'cross_val': pca_cv
}


In [25]:
# Train Other Classifiers
other_results = {}

In [26]:
# SGD Classifier
sgd_clf = SGDClassifier(random_state=1, max_iter=1000)
sgd_clf.fit(X_train, y_train)
y_pred_sgd = sgd_clf.predict(X_test)
sgd_train_test = accuracy_score(y_test, y_pred_sgd)
sgd_cv_scores = cross_val_score(sgd_clf, X_scaled, y, cv=10)
sgd_cv = sgd_cv_scores.mean()
other_results['SGD'] = {'train_test': sgd_train_test, 'cross_val': sgd_cv}

In [27]:
# Random Forest Classifier
rf_clf = RandomForestClassifier(n_estimators=100, random_state=1)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)
rf_train_test = accuracy_score(y_test, y_pred_rf)
rf_cv_scores = cross_val_score(rf_clf, X_scaled, y, cv=10)
rf_cv = rf_cv_scores.mean()
other_results['RandomForest'] = {'train_test': rf_train_test, 'cross_val': rf_cv}

In [28]:
# MLP Classifier
mlp_clf = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=1)
mlp_clf.fit(X_train, y_train)
y_pred_mlp = mlp_clf.predict(X_test)
mlp_train_test = accuracy_score(y_test, y_pred_mlp)
mlp_cv_scores = cross_val_score(mlp_clf, X_scaled, y, cv=10)
mlp_cv = mlp_cv_scores.mean()
other_results['MLP'] = {'train_test': mlp_train_test, 'cross_val': mlp_cv}

In [29]:
print("\n=== SVM MODELS SUMMARY TABLE ===")
print(f"{'SVM Model':<50} {'Train-test split':<20} {'Cross-validation':<20}")
print("-" * 90)
for model_name, scores in results.items():
    train_test_acc = scores['train_test'] * 100
    cross_val_acc = scores['cross_val'] * 100
    print(f"{model_name:<50} {train_test_acc:>18.2f}% {cross_val_acc:>18.2f}%")

print("\n=== ALL CLASSIFIERS COMPARISON ===")
# Add best SVM to other results for comparison
best_svm_key = max(results.keys(), key=lambda k: results[k]['train_test'])
other_results['SVM (Best)'] = results[best_svm_key]

print(f"{'Model':<15} {'Train-test split':<20} {'Cross-validation':<20}")
print("-" * 55)
for model_name, scores in other_results.items():
    train_test_acc = scores['train_test'] * 100
    cross_val_acc = scores['cross_val'] * 100
    print(f"{model_name:<15} {train_test_acc:>18.2f}% {cross_val_acc:>18.2f}%")


=== SVM MODELS SUMMARY TABLE ===
SVM Model                                          Train-test split     Cross-validation    
------------------------------------------------------------------------------------------
Original features                                               86.70%              84.93%
With hyper-parameter tuning                                     85.32%              85.10%
With feature selection and hyper parameter tuning               75.07%              81.68%
With PCA and hyper parameter tuning                             75.07%              84.10%

=== ALL CLASSIFIERS COMPARISON ===
Model           Train-test split     Cross-validation    
-------------------------------------------------------
SGD                          84.49%              82.68%
RandomForest                 85.04%              84.51%
MLP                          84.21%              81.93%
SVM (Best)                   86.70%              84.93%


## MODEL SELECTION

In [30]:
# Find best SVM model
best_svm_model = max(results.keys(), key=lambda k: results[k]['cross_val'])
best_svm_score = results[best_svm_model]['cross_val']

In [31]:
# Find best overall model
best_overall_model = max(other_results.keys(), key=lambda k: other_results[k]['cross_val'])
best_overall_score = other_results[best_overall_model]['cross_val']

print(f"\nBest SVM Model: {best_svm_model} (CV: {best_svm_score:.4f})")
print(f"Best Overall Model: {best_overall_model} (CV: {best_overall_score:.4f})")

print("\n=== MODEL SELECTION ANSWERS ===")
print("1) Best SVM model:", best_svm_model)
print("   Reason: Highest cross-validation accuracy among SVM variants")
print("2) Best ML model:", best_overall_model)
print("   Reason: Highest overall performance across all tested algorithms")


Best SVM Model: With hyper-parameter tuning (CV: 0.8510)
Best Overall Model: SVM (Best) (CV: 0.8493)

=== MODEL SELECTION ANSWERS ===
1) Best SVM model: With hyper-parameter tuning
   Reason: Highest cross-validation accuracy among SVM variants
2) Best ML model: SVM (Best)
   Reason: Highest overall performance across all tested algorithms
