In [1]:
# For evaluation
from sklearn import metrics
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
# For math/arrays
import numpy as np
# For plotting
import matplotlib.pyplot as plt
# For importing and processing HybrIK files
import os
import pickle
import scipy.optimize
import time

## Data Preprocessing

In [2]:
def extract_gait_cycle(feet_distances):
    '''
    To extract gait cycle according to the pattern of graph
    Output: a list of estimated start and end index for gait cycles
    '''
    y = feet_distances
    close_feet_count = 0
    cycle_start = 0

    if y[1] < y[0] :
        open_feet = False
        cycle_start = -1
    else:
        open_feet = True

    cycle_list = list()

    for i in range(1, len(feet_distances)):
        if y[i] < y[i-1] and open_feet:
            open_feet = False
        elif y[i] > y[i-1] and not open_feet:
            open_feet = True
            if cycle_start == -1:
                cycle_start = i
                continue

            close_feet_count = close_feet_count + 1
            if close_feet_count % 2 == 0:
                cycle_end = i - 1
                cycle_list.append((cycle_start, cycle_end))

                cycle_start = cycle_end

    return cycle_list

In [3]:
def fit_sin(tt, yy):
    '''
    Use coordinates given to produce sine graph
    '''
    tt = np.array(tt)
    yy = np.array(yy)

    ff = np.fft.fftfreq(len(tt), (tt[1]-tt[0]))
    Fyy = abs(np.fft.fft(yy))
    guess_freq = abs(ff[np.argmax(Fyy[1:])+1])
    guess_amp = np.std(yy) * 2.**0.5
    guess_offset = np.mean(yy)
    guess = np.array([guess_amp, 2.*np.pi*guess_freq, 0., guess_offset])

    def sinfunc(t, A, w, p, c):  return A * np.sin(w*t + p) + c
    popt, pcov = scipy.optimize.curve_fit(sinfunc, tt, yy, p0=guess)
    A, w, p, c = popt
    f = w/(2.*np.pi)
    fitfunc = lambda t: A * np.sin(w*t + p) + c
    return {"amp": A, "omega": w, "phase": p, "offset": c, "freq": f, "period": 1./f, "fitfunc": fitfunc, "maxcov": np.max(pcov), "rawres": (guess,popt,pcov)}

In [4]:
def process_single_file (data, coord_pos) :
    '''
    data: all data from one single pickle file
    coord_pos: 0 for x coordinates, 1 for y coordinates, 2 for z coordinates
    '''
    LEFT_ANKLE = 7
    RIGHT_ANKLE = 8
    feet_distances = []
    gait_cycles = []

    # Get distance between left and right ankles as feet distance
    for i, frame in enumerate(data['pred_uvd']):
        LAnkle = frame[LEFT_ANKLE, coord_pos]
        RAnkle = frame[RIGHT_ANKLE, coord_pos]

        feet_distance = abs(LAnkle - RAnkle)
        feet_distances.append(feet_distance)

    # Fit the feet distances data into sine graph,
    # to remove outliers or inconsistent patterns
    x = np.array(range(0, len(feet_distances)))
    try:
        f = fit_sin(x, feet_distances)["fitfunc"]
        y = f(x)
        gait_cycles = extract_gait_cycle(y)
        #print(gait_cycles)
    except:
        # Fit the feet distances data into polynomial graph,
        # when it cannot fit into sine graph or no gait cycles found from sine graph
        if len(gait_cycles) <= 0:
            from scipy.interpolate import splrep, splev
            max_distance = max(feet_distances)
            smoothness = max_distance ** 2
            bspl = splrep(x,feet_distances,s=smoothness)
            y = splev(x,bspl)
            gait_cycles = extract_gait_cycle(y)
            #print(gait_cycles)

    return gait_cycles

In [5]:
def extract_gait_information(file, data):
    '''
    Get gait start and end index lists according to the gait's view
    '''
    if 'front' in file.lower() or 'back' in file.lower():
        return process_single_file(data, 2)
    elif 'left' in file.lower() or 'right' in file.lower():
        return process_single_file(data, 0)

In [6]:
def extract_data(input_file, max_length):
    '''
    Extract all samples and labels from the input file.
    Return any file not met maximum sequence length as error files
    '''
    all_samples = []
    all_labels = []
    error_files = []

    for file in os.listdir(input_file):

        # Process only HybrIK pickle files
        if not file.endswith('.pk'):
            continue

        # Extract features with shape (sample_size, frame_num, keypoints_num * xyz)
        file_path = os.path.join(input_file, file)
        with open(file_path, 'rb') as f:
            data = pickle.load(f)

        full_keypoints = data['pred_xyz_24_struct']
        #full_keypoints = data['pred_xyz_29']
        total_frames = len(full_keypoints)

        gait_cycles = extract_gait_information(file, data)
        mid_index = len(gait_cycles) // 2

        # Take middle gait cycles if possible
        start_index = gait_cycles[mid_index][0] if len(gait_cycles) > 0 else 0
        final_index = start_index + max_length - 1

        # If max sequence length is greater than total frames of files,
        # the file will not be used for processing
        if final_index >= total_frames:
            if max_length <= total_frames:
                start_index = 0
                final_index = max_length - 1
            else:
                error_files.append((file, total_frames))
                continue

        keypoints = []

        for i in range(start_index, final_index + 1) :
            #keypoints.append(list(full_keypoints[i].flat))
            keypoints.extend(list(full_keypoints[i].flat))

        # Extract label
        # Check for gender
        if 'F' in file:
            gender = 1
        else:
            gender = 0

        # Check for age group
        from drive.MyDrive.model_preprocessing import process
        age = process(file)
        #age = int(file.split("_")[3])
        if age <= 0:
            continue
        elif age < 15:
            group = 0 # Child group
        elif age < 65:
            group = 1 # Adult group
        else:
            group = 2 # Senior group

        # Male child = 0
        # Female child = 1
        # Male adult = 2
        # Female adult = 3
        # Male senior = 4
        # Female senior = 5
        label = 2 * group + gender

        all_samples.append(keypoints)
        all_labels.append(label)

    return all_samples, all_labels, error_files

In [7]:
# Locate dataset for train, validation and test set
# Connect to Google Drive
from google.colab import drive
drive.mount('/content/drive')
input_folder = "/content/drive/MyDrive"
test_folder = os.path.join(input_folder, "PK_test_set")
train_folder = os.path.join(input_folder, "PK_train_set")
validation_folder = os.path.join(input_folder, "PK_validation_set")

# Set sequence length
sequence_length = 60

# Retrieve dataset
all_error_files = []
X_train, y_train, error_files = extract_data(train_folder, sequence_length)
all_error_files.extend(error_files)
X_test, y_test, error_files = extract_data(test_folder, sequence_length)
all_error_files.extend(error_files)
X_val, y_val, error_files = extract_data(validation_folder, sequence_length)
all_error_files.extend(error_files)

# Print processing results
print("Train set:", len(X_train), "\nTest set:", len(X_test), "\nValidation set:", len(X_val))
if len(all_error_files) > 0:
    print("Error in processing files:")
    for file in all_error_files:
        print(f"\t- {file[0]} ({file[1]} frames)")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Train set: 877 
Test set: 274 
Validation set: 219


In [8]:
# Calculate number of output classes
n_classes = len(np.unique(y_train))

# Calculate dimension of vector that similar to embedding
num_coord = 24
xyz = 3
embed_dim = num_coord * xyz

print(n_classes)

6


In [9]:
# Check frequency of each class in train dataset
label, counts = np.unique(y_train, return_counts=True)
#label, counts = np.unique(y_train_resampled, return_counts=True)
print(dict(zip(label, counts)))

{np.int64(0): np.int64(118), np.int64(1): np.int64(75), np.int64(2): np.int64(249), np.int64(3): np.int64(305), np.int64(4): np.int64(72), np.int64(5): np.int64(58)}


## Model Training

In [10]:
from sklearn.neighbors import KNeighborsClassifier

t = []
acc = []
pcs = []
rc = []

for i in range(5):
    print(f"-------------- Training {i + 1} ------------------")
    # Setting seed for reproducibility
    np.random.seed(42)

    # Create model
    model = KNeighborsClassifier(n_neighbors = 5)

    # Train model and record execution time
    start = time.time()
    model.fit(X_train, y_train)

    print("Time elapsed:", time.time() - start, "s")
    t.append(time.time() - start)

    # Evaluate on test dataset
    y_pred = model.predict(X_test)
    print("Accuracy: %.2f%%" % (metrics.accuracy_score(y_test, y_pred) * 100))
    print("Precision: %.2f%%" % (metrics.precision_score(y_test, y_pred, average='micro') * 100))
    print("Recall: %.2f%%" % (metrics.recall_score(y_test, y_pred, average='micro') * 100))
    acc.append(metrics.accuracy_score(y_test, y_pred) * 100)
    pcs.append(metrics.precision_score(y_test, y_pred, average='micro') * 100)
    rc.append(metrics.recall_score(y_test, y_pred, average='micro') * 100)

print("--------------------------------------------------------------")
print("Average execution time (s):", np.mean(t))
print("SD of execution time:", np.std(t))
print("Average accuracy:", np.mean(acc))
print("SD accuracy:", np.std(acc))
print("Average precision:", np.mean(pcs))
print("SD precision:", np.std(pcs))
print("Average recall:", np.mean(rc))
print("SD recall:", np.std(rc))

-------------- Training 1 ------------------
Time elapsed: 0.19968914985656738 s
Accuracy: 42.70%
Precision: 42.70%
Recall: 42.70%
-------------- Training 2 ------------------
Time elapsed: 0.16350674629211426 s
Accuracy: 42.70%
Precision: 42.70%
Recall: 42.70%
-------------- Training 3 ------------------
Time elapsed: 0.16484451293945312 s
Accuracy: 42.70%
Precision: 42.70%
Recall: 42.70%
-------------- Training 4 ------------------
Time elapsed: 0.16659212112426758 s
Accuracy: 42.70%
Precision: 42.70%
Recall: 42.70%
-------------- Training 5 ------------------
Time elapsed: 0.1609504222869873 s
Accuracy: 42.70%
Precision: 42.70%
Recall: 42.70%
--------------------------------------------------------------
Average execution time (s): 0.17115373611450196
SD of execution time: 0.014398593032943907
Average accuracy: 42.700729927007295
SD accuracy: 0.0
Average precision: 42.700729927007295
SD precision: 0.0
Average recall: 42.700729927007295
SD recall: 0.0


In [11]:
from sklearn.naive_bayes import GaussianNB

t = []
acc = []
pcs = []
rc = []

for i in range(5):
    print(f"-------------- Training {i + 1} ------------------")
    # Setting seed for reproducibility
    np.random.seed(42)

    # Create model
    model = GaussianNB()

    # Train model and record execution time
    start = time.time()
    model.fit(X_train, y_train)

    print("Time elapsed:", time.time() - start, "s")
    t.append(time.time() - start)

    # Evaluate on test dataset
    y_pred = model.predict(X_test)
    print("Accuracy: %.2f%%" % (metrics.accuracy_score(y_test, y_pred) * 100))
    print("Precision: %.2f%%" % (metrics.precision_score(y_test, y_pred, average='micro') * 100))
    print("Recall: %.2f%%" % (metrics.recall_score(y_test, y_pred, average='micro') * 100))
    acc.append(metrics.accuracy_score(y_test, y_pred) * 100)
    pcs.append(metrics.precision_score(y_test, y_pred, average='micro') * 100)
    rc.append(metrics.recall_score(y_test, y_pred, average='micro') * 100)

print("--------------------------------------------------------------")
print("Average execution time (s):", np.mean(t))
print("SD of execution time:", np.std(t))
print("Average accuracy:", np.mean(acc))
print("SD accuracy:", np.std(acc))
print("Average precision:", np.mean(pcs))
print("SD precision:", np.std(pcs))
print("Average recall:", np.mean(rc))
print("SD recall:", np.std(rc))

-------------- Training 1 ------------------
Time elapsed: 0.18431758880615234 s
Accuracy: 24.09%
Precision: 24.09%
Recall: 24.09%
-------------- Training 2 ------------------
Time elapsed: 0.19162917137145996 s
Accuracy: 24.09%
Precision: 24.09%
Recall: 24.09%
-------------- Training 3 ------------------
Time elapsed: 0.20212435722351074 s
Accuracy: 24.09%
Precision: 24.09%
Recall: 24.09%
-------------- Training 4 ------------------
Time elapsed: 0.18126893043518066 s
Accuracy: 24.09%
Precision: 24.09%
Recall: 24.09%
-------------- Training 5 ------------------
Time elapsed: 0.1826488971710205 s
Accuracy: 24.09%
Precision: 24.09%
Recall: 24.09%
--------------------------------------------------------------
Average execution time (s): 0.18848199844360353
SD of execution time: 0.007740059990452046
Average accuracy: 24.087591240875913
SD accuracy: 0.0
Average precision: 24.087591240875913
SD precision: 0.0
Average recall: 24.087591240875913
SD recall: 0.0


In [12]:
from sklearn.ensemble import RandomForestClassifier

t = []
acc = []
pcs = []
rc = []

for i in range(5):
    print(f"-------------- Training {i + 1} ------------------")
    # Setting seed for reproducibility
    np.random.seed(42)

    # Create model
    model = RandomForestClassifier()

    # Train model and record execution time
    start = time.time()
    model.fit(X_train, y_train)

    print("Time elapsed:", time.time() - start, "s")
    t.append(time.time() - start)

    # Evaluate on test dataset
    y_pred = model.predict(X_test)
    print("Accuracy: %.2f%%" % (metrics.accuracy_score(y_test, y_pred) * 100))
    print("Precision: %.2f%%" % (metrics.precision_score(y_test, y_pred, average='micro') * 100))
    print("Recall: %.2f%%" % (metrics.recall_score(y_test, y_pred, average='micro') * 100))
    acc.append(metrics.accuracy_score(y_test, y_pred) * 100)
    pcs.append(metrics.precision_score(y_test, y_pred, average='micro') * 100)
    rc.append(metrics.recall_score(y_test, y_pred, average='micro') * 100)

print("--------------------------------------------------------------")
print("Average execution time (s):", np.mean(t))
print("SD of execution time:", np.std(t))
print("Average accuracy:", np.mean(acc))
print("SD accuracy:", np.std(acc))
print("Average precision:", np.mean(pcs))
print("SD precision:", np.std(pcs))
print("Average recall:", np.mean(rc))
print("SD recall:", np.std(rc))

-------------- Training 1 ------------------
Time elapsed: 5.039495468139648 s
Accuracy: 57.66%
Precision: 57.66%
Recall: 57.66%
-------------- Training 2 ------------------
Time elapsed: 5.468954801559448 s
Accuracy: 57.66%
Precision: 57.66%
Recall: 57.66%
-------------- Training 3 ------------------
Time elapsed: 4.744982957839966 s
Accuracy: 57.66%
Precision: 57.66%
Recall: 57.66%
-------------- Training 4 ------------------
Time elapsed: 5.668520450592041 s
Accuracy: 57.66%
Precision: 57.66%
Recall: 57.66%
-------------- Training 5 ------------------
Time elapsed: 4.720388174057007 s
Accuracy: 57.66%
Precision: 57.66%
Recall: 57.66%
--------------------------------------------------------------
Average execution time (s): 5.128598070144653
SD of execution time: 0.38184648858595455
Average accuracy: 57.66423357664233
SD accuracy: 0.0
Average precision: 57.66423357664233
SD precision: 0.0
Average recall: 57.66423357664233
SD recall: 0.0


In [13]:
from sklearn.svm import SVC

t = []
acc = []
pcs = []
rc = []

for i in range(5):
    print(f"-------------- Training {i + 1} ------------------")
    # Setting seed for reproducibility
    np.random.seed(42)

    # Create model
    model = SVC(kernel='linear', decision_function_shape='ovr')

    # Train model and record execution time
    start = time.time()
    model.fit(X_train, y_train)

    print("Time elapsed:", time.time() - start, "s")
    t.append(time.time() - start)

    # Evaluate on test dataset
    y_pred = model.predict(X_test)
    print("Accuracy: %.2f%%" % (metrics.accuracy_score(y_test, y_pred) * 100))
    print("Precision: %.2f%%" % (metrics.precision_score(y_test, y_pred, average='micro') * 100))
    print("Recall: %.2f%%" % (metrics.recall_score(y_test, y_pred, average='micro') * 100))
    acc.append(metrics.accuracy_score(y_test, y_pred) * 100)
    pcs.append(metrics.precision_score(y_test, y_pred, average='micro') * 100)
    rc.append(metrics.recall_score(y_test, y_pred, average='micro') * 100)

print("--------------------------------------------------------------")
print("Average execution time (s):", np.mean(t))
print("SD of execution time:", np.std(t))
print("Average accuracy:", np.mean(acc))
print("SD accuracy:", np.std(acc))
print("Average precision:", np.mean(pcs))
print("SD precision:", np.std(pcs))
print("Average recall:", np.mean(rc))
print("SD recall:", np.std(rc))

-------------- Training 1 ------------------
Time elapsed: 2.2488181591033936 s
Accuracy: 41.24%
Precision: 41.24%
Recall: 41.24%
-------------- Training 2 ------------------
Time elapsed: 2.318105936050415 s
Accuracy: 41.24%
Precision: 41.24%
Recall: 41.24%
-------------- Training 3 ------------------
Time elapsed: 2.9038610458374023 s
Accuracy: 41.24%
Precision: 41.24%
Recall: 41.24%
-------------- Training 4 ------------------
Time elapsed: 2.2785913944244385 s
Accuracy: 41.24%
Precision: 41.24%
Recall: 41.24%
-------------- Training 5 ------------------
Time elapsed: 2.1925034523010254 s
Accuracy: 41.24%
Precision: 41.24%
Recall: 41.24%
--------------------------------------------------------------
Average execution time (s): 2.3885583877563477
SD of execution time: 0.2609507730060529
Average accuracy: 41.24087591240876
SD accuracy: 0.0
Average precision: 41.24087591240876
SD precision: 0.0
Average recall: 41.24087591240876
SD recall: 0.0


In [14]:
from sklearn.ensemble import GradientBoostingClassifier

t = []
acc = []
pcs = []
rc = []

for i in range(5):
    print(f"-------------- Training {i + 1} ------------------")
    # Setting seed for reproducibility
    np.random.seed(42)

    # Create model
    model = GradientBoostingClassifier()

    # Train model and record execution time
    start = time.time()
    model.fit(X_train, y_train)

    print("Time elapsed:", time.time() - start, "s")
    t.append(time.time() - start)

    # Evaluate on test dataset
    y_pred = model.predict(X_test)
    print("Accuracy: %.2f%%" % (metrics.accuracy_score(y_test, y_pred) * 100))
    print("Precision: %.2f%%" % (metrics.precision_score(y_test, y_pred, average='micro') * 100))
    print("Recall: %.2f%%" % (metrics.recall_score(y_test, y_pred, average='micro') * 100))
    acc.append(metrics.accuracy_score(y_test, y_pred) * 100)
    pcs.append(metrics.precision_score(y_test, y_pred, average='micro') * 100)
    rc.append(metrics.recall_score(y_test, y_pred, average='micro') * 100)

print("--------------------------------------------------------------")
print("Average execution time (s):", np.mean(t))
print("SD of execution time:", np.std(t))
print("Average accuracy:", np.mean(acc))
print("SD accuracy:", np.std(acc))
print("Average precision:", np.mean(pcs))
print("SD precision:", np.std(pcs))
print("Average recall:", np.mean(rc))
print("SD recall:", np.std(rc))

-------------- Training 1 ------------------
Time elapsed: 873.0210585594177 s
Accuracy: 52.92%
Precision: 52.92%
Recall: 52.92%
-------------- Training 2 ------------------
Time elapsed: 870.6592564582825 s
Accuracy: 52.92%
Precision: 52.92%
Recall: 52.92%
-------------- Training 3 ------------------
Time elapsed: 869.6433811187744 s
Accuracy: 52.92%
Precision: 52.92%
Recall: 52.92%
-------------- Training 4 ------------------
Time elapsed: 873.4864819049835 s
Accuracy: 52.92%
Precision: 52.92%
Recall: 52.92%
-------------- Training 5 ------------------
Time elapsed: 869.2877688407898 s
Accuracy: 52.92%
Precision: 52.92%
Recall: 52.92%
--------------------------------------------------------------
Average execution time (s): 871.219718503952
SD of execution time: 1.7271025181959565
Average accuracy: 52.91970802919708
SD accuracy: 7.105427357601002e-15
Average precision: 52.91970802919708
SD precision: 7.105427357601002e-15
Average recall: 52.91970802919708
SD recall: 7.105427357601002