In [20]:
import os
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Paths
UCI_PATH = r'C:\Users\vandan raval\Downloads\human+activity+recognition+using+smartphones\UCI HAR Dataset\UCI HAR Dataset'
MY_DATA_PATH = r'C:\Users\vandan raval\Downloads\Collected_Data_HAR\Collected_Data'  

# Label map
label_map = {
    1: 'WALKING',
    2: 'WALKING_UPSTAIRS',
    3: 'WALKING_DOWNSTAIRS',
    4: 'SITTING',
    5: 'STANDING',
    6: 'LAYING'
}

def load_uci_data(max_samples=100):
    data = []
    labels = []
    for split in ['train', 'test']:
        x = pd.read_csv(f"{UCI_PATH}/{split}/Inertial Signals/total_acc_x_{split}.txt", delim_whitespace=True, header=None)
        y = pd.read_csv(f"{UCI_PATH}/{split}/Inertial Signals/total_acc_y_{split}.txt", delim_whitespace=True, header=None)
        z = pd.read_csv(f"{UCI_PATH}/{split}/Inertial Signals/total_acc_z_{split}.txt", delim_whitespace=True, header=None)
        label = pd.read_csv(f"{UCI_PATH}/{split}/y_{split}.txt", header=None)

        for i in range(min(len(label), max_samples)):
            sample = np.vstack([x.iloc[i], y.iloc[i], z.iloc[i]])
            data.append(sample)
            labels.append(label_map[label.iloc[i, 0]])
    return data, labels


def extract_simple_features(data):
    features = []
    for sample in data:
        mean_features = np.mean(sample, axis=1)  # mean of x,y,z
        std_features = np.std(sample, axis=1)    # std of x,y,z
        features.append(np.hstack([mean_features, std_features]))
    return np.array(features)

def load_your_multiclass_data(data_folder):
    your_data = []
    your_labels = []

    print(f"Listing files in {data_folder}:")
    print(os.listdir(data_folder))

    for root, dirs, files in os.walk(data_folder):
        for file in files:
            if file.endswith('.csv'):
                filepath = os.path.join(root, file)
                print(f"Processing file: {filepath}")
                try:
                    # Skip the first 3 rows assuming they are headers or metadata
                    df = pd.read_csv(filepath, skiprows=3)

                    # Use the correct column names based on the previous output
                    sample = df[['ax (m/s^2)', 'ay (m/s^2)', 'az (m/s^2)']].to_numpy().T
                    your_data.append(sample)

                    name = root.split('/')[-1].lower()  # Extract label from subdirectory name
                    if "walking_upstairs" in name:
                        your_labels.append("WALKING_UPSTAIRS")
                    elif "walking_downstairs" in name:
                        your_labels.append("WALKING_DOWNSTAIRS")
                    elif "walking" in name:
                        your_labels.append("WALKING")
                    elif "sitting" in name:
                        your_labels.append("SITTING")
                    elif "standing" in name:
                        your_labels.append("STANDING")
                    elif "laying" in name:
                        your_labels.append("LAYING")
                    else:
                        your_labels.append("UNKNOWN")
                except pd.errors.ParserError as e:
                    print(f"Error parsing file: {filepath} - {e}")
                except KeyError as e:
                    print(f"KeyError processing file: {filepath} - {e}. Make sure 'ax (m/s^2)', 'ay (m/s^2)', 'az (m/s^2)' columns exist.")


    return your_data, your_labels


uci_data, uci_labels = load_uci_data(max_samples=100)

print("Extracting features from UCI data...")
uci_features = extract_simple_features(uci_data)

X_train, X_test, y_train, y_test = train_test_split(uci_features, uci_labels, test_size=0.2, random_state=42)
model = DecisionTreeClassifier(max_depth=5)
model.fit(X_train, y_train)

print("Evaluating on test set...")
test_preds = model.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, test_preds))

print("Loading your multi-class data...")
your_data, your_labels = load_your_multiclass_data(MY_DATA_PATH)

print("Extracting features from your data...")
your_features = extract_simple_features(your_data)

print("Predicting on your data...")
your_preds = model.predict(your_features)

print("True labels:", your_labels)
print("Predicted labels:", your_preds.tolist())

if len(your_labels) > 0 and len(your_preds) > 0:
    print("Accuracy on your data:", accuracy_score(your_labels, your_preds))
else:
    print("Cannot calculate accuracy: No data or predictions available for your data.")

  x = pd.read_csv(f"{UCI_PATH}/{split}/Inertial Signals/total_acc_x_{split}.txt", delim_whitespace=True, header=None)
  y = pd.read_csv(f"{UCI_PATH}/{split}/Inertial Signals/total_acc_y_{split}.txt", delim_whitespace=True, header=None)
  z = pd.read_csv(f"{UCI_PATH}/{split}/Inertial Signals/total_acc_z_{split}.txt", delim_whitespace=True, header=None)
  x = pd.read_csv(f"{UCI_PATH}/{split}/Inertial Signals/total_acc_x_{split}.txt", delim_whitespace=True, header=None)
  y = pd.read_csv(f"{UCI_PATH}/{split}/Inertial Signals/total_acc_y_{split}.txt", delim_whitespace=True, header=None)
  z = pd.read_csv(f"{UCI_PATH}/{split}/Inertial Signals/total_acc_z_{split}.txt", delim_whitespace=True, header=None)


Extracting features from UCI data...
Evaluating on test set...
Test Accuracy: 0.975
Loading your multi-class data...
Listing files in C:\Users\vandan raval\Downloads\Collected_Data_HAR\Collected_Data:
['LAYING', 'SITTING', 'STANDING', 'WALKING', 'WALKING_DOWNSTAIRS', 'WALKING_UPSTAIRS']
Processing file: C:\Users\vandan raval\Downloads\Collected_Data_HAR\Collected_Data\LAYING\laying 1.csv
Processing file: C:\Users\vandan raval\Downloads\Collected_Data_HAR\Collected_Data\LAYING\laying 2.csv
Processing file: C:\Users\vandan raval\Downloads\Collected_Data_HAR\Collected_Data\LAYING\laying 3.csv
Processing file: C:\Users\vandan raval\Downloads\Collected_Data_HAR\Collected_Data\SITTING\sitting 1.csv
Processing file: C:\Users\vandan raval\Downloads\Collected_Data_HAR\Collected_Data\SITTING\sitting 2.csv
Processing file: C:\Users\vandan raval\Downloads\Collected_Data_HAR\Collected_Data\SITTING\sitting 3.csv
Processing file: C:\Users\vandan raval\Downloads\Collected_Data_HAR\Collected_Data\STAND

In [19]:
import os
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import matplotlib.pyplot as plt
import tsfel

UCI_PATH = r'C:\Users\vandan raval\Downloads\human+activity+recognition+using+smartphones\UCI HAR Dataset\UCI HAR Dataset'
MY_DATA_PATH = r'C:\Users\vandan raval\Downloads\Collected_Data_HAR\Collected_Data'  

label_map = {
    1: 'WALKING',
    2: 'WALKING_UPSTAIRS',
    3: 'WALKING_DOWNSTAIRS',
    4: 'SITTING',
    5: 'STANDING',
    6: 'LAYING'
}


def load_uci_data(max_samples=100):
    data = []
    labels = []
    for split in ['train', 'test']:
        x = pd.read_csv(f"{UCI_PATH}/{split}/Inertial Signals/total_acc_x_{split}.txt", delim_whitespace=True, header=None)
        y = pd.read_csv(f"{UCI_PATH}/{split}/Inertial Signals/total_acc_y_{split}.txt", delim_whitespace=True, header=None)
        z = pd.read_csv(f"{UCI_PATH}/{split}/Inertial Signals/total_acc_z_{split}.txt", delim_whitespace=True, header=None)
        label = pd.read_csv(f"{UCI_PATH}/{split}/y_{split}.txt", header=None)

        for i in range(min(len(label), max_samples)):
            sample = np.vstack([x.iloc[i], y.iloc[i], z.iloc[i]])
            data.append(sample)
            labels.append(label_map[label.iloc[i, 0]])
    return data, labels

def tsfel_features(data, sampling_rate=50):
    
    all_features = []
    cfg = tsfel.get_features_by_domain()  
    for sample in data:
        df = pd.DataFrame(sample.T, columns=["x","y","z"])
        feature = tsfel.time_series_features_extractor(cfg, df, fs=sampling_rate)
        all_features.append(feature.values.flatten())
    return np.array(all_features)


def load_your_data(folder):
    your_data, your_labels = [], []
    for root, dirs, files in os.walk(folder):
        for file in files:
            if file.endswith('.csv'):
                try:
                    df = pd.read_csv(os.path.join(root, file), skiprows=3)
                    sample = df[['ax (m/s^2)', 'ay (m/s^2)', 'az (m/s^2)']].to_numpy().T
                    your_data.append(sample)

                    name = root.lower()
                    if "walking_upstairs" in name:
                        your_labels.append("WALKING_UPSTAIRS")
                    elif "walking_downstairs" in name:
                        your_labels.append("WALKING_DOWNSTAIRS")
                    elif "walking" in name:
                        your_labels.append("WALKING")
                    elif "sitting" in name:
                        your_labels.append("SITTING")
                    elif "standing" in name:
                        your_labels.append("STANDING")
                    elif "laying" in name:
                        your_labels.append("LAYING")
                except Exception as e:
                    print("Error processing", file, e)
    return your_data, your_labels


uci_data, uci_labels = load_uci_data(max_samples=100)
your_data, your_labels = load_your_data(MY_DATA_PATH)


print("Extracting UCI features")
X = tsfel_features(uci_data)   # change to simple_features(uci_data) if you want simple features
print("Extracting our features")
X_ours = tsfel_features(your_data)

# Train/test split on UCI
X_train, X_test, y_train, y_test = train_test_split(X, uci_labels, test_size=0.2, random_state=42)

# Train model
model = DecisionTreeClassifier(max_depth=6, random_state=42)
model.fit(X_train, y_train)

y_prediction = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_prediction))
print("Precision:", precision_score(y_test, y_prediction, average='weighted'))
print("Recall:", recall_score(y_test, y_prediction, average='weighted'))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_prediction))


if len(your_labels) > 0:
    your_prediction = model.predict(X_ours)
    print("Actual labels :", your_labels)
    print("Predicted labels  :", your_prediction.tolist())
    print("Accuracy:", accuracy_score(your_labels, your_prediction))
else:
    print("No data found.")



  x = pd.read_csv(f"{UCI_PATH}/{split}/Inertial Signals/total_acc_x_{split}.txt", delim_whitespace=True, header=None)
  y = pd.read_csv(f"{UCI_PATH}/{split}/Inertial Signals/total_acc_y_{split}.txt", delim_whitespace=True, header=None)
  z = pd.read_csv(f"{UCI_PATH}/{split}/Inertial Signals/total_acc_z_{split}.txt", delim_whitespace=True, header=None)
  x = pd.read_csv(f"{UCI_PATH}/{split}/Inertial Signals/total_acc_x_{split}.txt", delim_whitespace=True, header=None)
  y = pd.read_csv(f"{UCI_PATH}/{split}/Inertial Signals/total_acc_y_{split}.txt", delim_whitespace=True, header=None)
  z = pd.read_csv(f"{UCI_PATH}/{split}/Inertial Signals/total_acc_z_{split}.txt", delim_whitespace=True, header=None)


Extracting UCI features


Extracting our features


Accuracy: 1.0
Precision: 1.0
Recall: 1.0
Confusion Matrix:
 [[16  0  0  0]
 [ 0  8  0  0]
 [ 0  0  9  0]
 [ 0  0  0  7]]
Actual labels : ['LAYING', 'LAYING', 'LAYING', 'SITTING', 'SITTING', 'SITTING', 'STANDING', 'STANDING', 'STANDING', 'WALKING', 'WALKING', 'WALKING', 'WALKING_DOWNSTAIRS', 'WALKING_DOWNSTAIRS', 'WALKING_DOWNSTAIRS', 'WALKING_UPSTAIRS', 'WALKING_UPSTAIRS', 'WALKING_UPSTAIRS']
Predicted labels  : ['LAYING', 'WALKING', 'WALKING', 'LAYING', 'LAYING', 'LAYING', 'LAYING', 'LAYING', 'LAYING', 'WALKING', 'WALKING', 'WALKING', 'WALKING', 'WALKING', 'WALKING', 'WALKING', 'WALKING', 'WALKING']
Accuracy: 0.2222222222222222


ANS: 
Preprocessing is done by keeping the accelerometer data limmited to 10sec with similar phone alignment throughout recording 
And for Featurization we used 2 methods 
1) Simple features  : kept mean & std for each axis just to be consistent throughout 
2) TSFEL fetaures : Extracted rich features for better pattern recognition

The Decision Tree trained on UCI HAR data achieved good accuracy on the UCI test set, but when applied to our collected data the performance had a dip accuracy wise.
With only mean & std features, the predictions were mostly incorrect which is reason for low accuracy , mainly due to differences in device sensors, placement, and preprocessing.
Using TSFEL features improved the performance by some margin , showing that using rich features can  help the model to increase the generability on new data.