In [103]:
import xgboost as xgb
from sklearn import metrics

import numpy as np
import pandas as pd
import os
import pickle
import itertools

# PATHS

In [3]:
# Root folder that holds one sub-directory per recorded dataset.
DATA_DIRECTORY = "../data/topviewkinect/"

# Name used as the file prefix of the merged dataset.
DATA_ALL = "all"

# Folder where the merged CSVs and split pickles are written.
PREPROCESSED_DIRECTORY = "".join((DATA_DIRECTORY, "all/"))

# PREPROCESSING

### Sanity check

In [29]:
# Scan every numerically named dataset folder and report unlabeled frames
# (activity == -1) or a second skeleton (skeleton_id == 1) in either table.
numeric_dataset_ids = [name for name in next(os.walk(DATA_DIRECTORY))[1] if name.isdigit()]

for dataset_id in numeric_dataset_ids:
    features_csv = "{data_dir}/{dataset_id}/features.csv".format(data_dir=DATA_DIRECTORY, dataset_id=dataset_id)
    features_df = pd.read_csv(features_csv)
    labels_csv = "{data_dir}/{dataset_id}/labels.csv".format(data_dir=DATA_DIRECTORY, dataset_id=dataset_id)
    labels_df = pd.read_csv(labels_csv)

    # (offending value, column values to search, message printed on a hit)
    checks = (
        (-1, labels_df["activity"].values, "missing labels"),
        (1, labels_df["skeleton_id"].values, "multiple people labels"),
        (1, features_df["skeleton_id"].values, "multiple people features"),
    )
    for bad_value, column_values, message in checks:
        if bad_value in column_values:
            print(dataset_id, message)

print("Done!")

Done!


### All features and labels

In [5]:
# Bookkeeping columns removed from every labels table before merging.
ignored_labels_cols = ["frame_id", "skeleton_id"]
# The feature tables additionally lose the x/y/z columns.
ignored_features_cols = ignored_labels_cols + ["x", "y", "z"]

# Destination paths of the merged feature and label CSVs.
all_features_csv, all_labels_csv = (
    template.format(data_dir=PREPROCESSED_DIRECTORY, data=DATA_ALL)
    for template in ("{data_dir}/{data}_features.csv", "{data_dir}/{data}_labels.csv")
)

In [28]:
# Rebuild the merged feature/label CSVs from every numerically named dataset folder.
# Mode "w" truncates any previous output, and the with-block closes both files even
# when a dataset fails part-way through. newline="" leaves line endings to pandas,
# which is what DataFrame.to_csv expects when it is handed a file object.
with open(all_features_csv, "w", newline="") as all_features_f, \
        open(all_labels_csv, "w", newline="") as all_labels_f:
    header = True  # only the first dataset written carries the CSV header row

    for dataset_id in next(os.walk(DATA_DIRECTORY))[1]:
        if not dataset_id.isdigit():
            continue
        print(dataset_id, ", ", end="")

        features_csv = "{data_dir}/{dataset_id}/features.csv".format(data_dir=DATA_DIRECTORY, dataset_id=dataset_id)
        features_df = pd.read_csv(features_csv, low_memory=False)
        labels_csv = "{data_dir}/{dataset_id}/labels.csv".format(data_dir=DATA_DIRECTORY, dataset_id=dataset_id)
        labels_df = pd.read_csv(labels_csv)

        # Keep only tracking skeletons (skeleton_id 0) and drop activity code 6.
        # NOTE(review): the meaning of activity 6 is not shown in this notebook - confirm.
        labels_df = labels_df.loc[labels_df["skeleton_id"] == 0]
        labels_df = labels_df.loc[labels_df["activity"] != 6]
        frame_indices = labels_df["frame_id"].values

        # Restrict both tables to the frame ids they have in common.
        features_df = features_df.loc[features_df["frame_id"].isin(frame_indices)]
        labels_df = labels_df.loc[labels_df["frame_id"].isin(features_df["frame_id"].values)]

        # Append features and labels, tagging every row with its subject id.
        features_df = features_df.drop(labels=ignored_features_cols, axis=1)
        features_df["subject"] = int(dataset_id)
        features_df = features_df.astype(np.float32)
        features_df.to_csv(all_features_f, header=header, index=False)

        labels_df = labels_df.drop(labels=ignored_labels_cols, axis=1)
        labels_df["subject"] = int(dataset_id)
        # np.int was only an alias of the builtin int and was removed in NumPy 1.24.
        labels_df = labels_df.astype(np.int64)
        labels_df.to_csv(all_labels_f, header=header, index=False)

        header = False

print("\nDone!")

1 , 10 , 11 , 12 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 
Done!


### Data overview

In [6]:
# Reload the merged tables from disk so the cells below work from the saved CSVs.
all_features_df = pd.read_csv(all_features_csv)
all_labels_df = pd.read_csv(all_labels_csv)

In [7]:
# (rows, columns) of the merged feature table.
all_features_df.shape

(77024, 73)

In [8]:
# Preview the first rows of the merged feature table.
all_features_df.head()

Unnamed: 0,layer_area_0,layer_area_1,layer_area_2,layer_contours_0,layer_contours_1,layer_distance_0,layer_distance_1,layer_distance_2,layer_distance_3,layer_distance_4,...,interlayer_pos_16,interlayer_pos_17,extremities0,extreme_infrared_0,extreme_infrared_1,extreme_infrared_2,extreme_infrared_3,extreme_infrared_4,extreme_infrared_5,subject
0,0.297578,0.411765,0.290657,3.0,3.0,16.5529,26.6833,26.019199,26.6833,201.0,...,-26.0,-107.0,4.0,0.0,10.0,11.5,11.5,0.0,11.5,1.0
1,0.310345,0.419238,0.270417,3.0,3.0,16.401199,26.4764,26.019199,26.4764,191.5,...,-26.0,-105.0,5.0,0.5,9.0,11.0,1.0,0.5,11.0,1.0
2,0.318015,0.386029,0.295956,3.0,3.0,16.1245,26.2488,27.018499,26.2488,174.5,...,-26.0,-104.0,5.0,0.0,12.5,4.5,4.5,0.5,13.0,1.0
3,0.348399,0.384181,0.26742,3.0,3.0,16.401199,26.419701,26.4764,26.419701,164.0,...,-25.0,-103.0,5.0,0.0,6.0,4.5,0.0,0.0,7.0,1.0
4,0.356383,0.370567,0.27305,3.0,3.0,17.719999,27.459101,27.459101,27.459101,164.5,...,-26.0,-107.0,3.0,0.0,0.0,0.5,0.0,0.0,0.5,1.0


In [9]:
# (rows, columns) of the merged label table.
all_labels_df.shape

(77024, 4)

In [10]:
# Preview the first rows of the merged label table.
all_labels_df.head()

Unnamed: 0,activity,orientation,orientation_accurate,subject
0,0,130,-1,1
1,0,130,-1,1
2,0,120,-1,1
3,0,130,-1,1
4,0,150,-1,1


In [11]:
# Distinct subject ids found in the merged labels (np.unique returns them sorted).
subjects_list = np.unique(all_labels_df["subject"])
subjects_list

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12], dtype=int64)

In [12]:
# Distinct activity codes found in the merged labels; its length is later used as num_class.
activities_list = np.unique(all_labels_df["activity"])
activities_list

array([0, 1, 2, 3, 4, 5], dtype=int64)

In [94]:
# Target: the activity column, kept as a one-column frame.
y = all_labels_df[["activity"]]
# Inputs: every merged feature column except the subject id.
X = all_features_df.drop(labels=["subject"], axis="columns")
# Whole dataset wrapped as a single DMatrix.
X_all = xgb.DMatrix(data=X, label=y)

###  Data split

#### Initial Cross-Subject Test (2-fold)

In [34]:
# Partition the subjects by id parity; each half serves once as train and once as test.
initial_cs_odd = [subject_id for subject_id in subjects_list if subject_id % 2 == 1]
initial_cs_even = [subject_id for subject_id in subjects_list if subject_id % 2 == 0]
initial_cs_split = [
    dict(zip(("train", "test"), fold))
    for fold in (
        (initial_cs_odd, initial_cs_even),
        (initial_cs_even, initial_cs_odd),
    )
]

In [35]:
# Pickle path for the initial cross-subject split, inside the preprocessed folder.
initial_cs_split_fn = "{data_dir}/{data}_split_init_cs.pickle".format(data_dir=PREPROCESSED_DIRECTORY, data=DATA_ALL)

In [36]:
# Persist the initial cross-subject split (overwrites any existing pickle).
with open(initial_cs_split_fn, "wb") as f:
    pickle.dump(initial_cs_split, f)

#### Complete Cross-Subject Test (2-fold)

In [22]:
# Enumerate every way of choosing half of the subjects for training; the remaining
# subjects form the matching test set.
complete_cs_train_indices = list(itertools.combinations(subjects_list, len(subjects_list) // 2))

# Store flat lists of subject ids, the same shape initial_cs_split uses, so each entry
# can be passed straight to `.isin(...)`. Wrapping the tuple/set in another list would
# make `.isin` look for the container itself instead of the subject ids.
complete_cs_split = [
    {"train": list(train_indices), "test": sorted(set(subjects_list) - set(train_indices))}
    for train_indices in complete_cs_train_indices
]

In [23]:
# Pickle path for the complete cross-subject split, inside the preprocessed folder.
complete_cs_split_fn = "{data_dir}/{data}_split_complete_cs.pickle".format(data_dir=PREPROCESSED_DIRECTORY, data=DATA_ALL)

In [24]:
# Persist the complete cross-subject split (overwrites any existing pickle).
with open(complete_cs_split_fn, "wb") as f:
    pickle.dump(complete_cs_split, f)

#### N-Subject-Fold (12-fold)

In [25]:
# Leave-one-subject-out folds: each subject is the test set once and all others train.
# "train" is a flat, sorted list of subject ids (not a list wrapping a set) so it has
# the same shape as the entries of initial_cs_split and works with `.isin(...)`.
n_subject_split = [
    {"train": sorted(set(subjects_list) - {test_idx}), "test": [test_idx]}
    for test_idx in subjects_list
]

In [26]:
# Pickle path for the leave-one-subject-out split, inside the preprocessed folder.
n_subject_split_fn = "{data_dir}/{data}_split_n_subject.pickle".format(data_dir=PREPROCESSED_DIRECTORY, data=DATA_ALL)

In [27]:
# Persist the leave-one-subject-out split (overwrites any existing pickle).
with open(n_subject_split_fn, "wb") as f:
    pickle.dump(n_subject_split, f)

# TRAINING

#### Initial Cross-Subject Test

In [37]:
# Reload the initial cross-subject split from the pickle written earlier in this notebook.
# NOTE(review): pickle.load executes arbitrary code from the file - only load pickles you produced.
with open(initial_cs_split_fn, "rb") as f:
    initial_cs_split = pickle.load(f)

In [68]:
# Booster parameters passed to xgb.train for the activity classifier.
PARAMS = {
    "eta": 0.3,
    "gamma": 1,
    "lambda": 1,
    "alpha": 0,
    "max_depth": 6,
    "colsample_bytree": 0.5,
    "colsample_bylevel": 0.5,
    "subsample": 0.5,
    "objective": "multi:softmax",
    "eval_metric": "merror",
    # One class per distinct activity code found in the labels.
    "num_class": len(activities_list),
    "silent": 0,
}

# Maximum boosting rounds, and the early-stopping patience handed to xgb.train.
NUM_ROUNDS, EARLYSTOPPING_ROUNDS = 200, 30

In [74]:
# Find best iteration by early stopping

# One list of per-round eval "merror" values per cross-subject fold.
init_cs_results = []

for train_test in initial_cs_split:
    # Positional row numbers of the fold's subjects. These are what .iloc expects;
    # index *labels* would only select the right rows while the frame keeps a
    # default 0..n-1 index.
    # Assumes X, y and all_labels_df are row-aligned (written and read in the same order).
    train_indices = np.flatnonzero(all_labels_df["subject"].isin(train_test["train"]).values)
    test_indices = np.flatnonzero(all_labels_df["subject"].isin(train_test["test"]).values)

    X_train, y_train = X.iloc[train_indices], y.iloc[train_indices]
    X_test, y_test = X.iloc[test_indices], y.iloc[test_indices]

    train_dmatrix = xgb.DMatrix(X_train, y_train)
    test_dmatrix = xgb.DMatrix(X_test, y_test)
    # The last watchlist entry ("eval") is the one xgboost uses for early stopping.
    watchlist = [(train_dmatrix, "train"), (test_dmatrix, "eval")]
    results = {}

    xgb.train(params=PARAMS, dtrain=train_dmatrix, num_boost_round=NUM_ROUNDS, evals=watchlist,
              evals_result=results, early_stopping_rounds=EARLYSTOPPING_ROUNDS)

    init_cs_results.append(results["eval"]["merror"])

[0]	train-merror:0.055888	eval-merror:0.256446
Multiple eval metrics have been passed: 'eval-merror' will be used for early stopping.

Will train until eval-merror hasn't improved in 30 rounds.
[1]	train-merror:0.034998	eval-merror:0.235604
[2]	train-merror:0.028044	eval-merror:0.219468
[3]	train-merror:0.025755	eval-merror:0.189738
[4]	train-merror:0.023237	eval-merror:0.181777
[5]	train-merror:0.020976	eval-merror:0.180898
[6]	train-merror:0.0182	eval-merror:0.16911
[7]	train-merror:0.016855	eval-merror:0.16172
[8]	train-merror:0.015424	eval-merror:0.15806
[9]	train-merror:0.014823	eval-merror:0.159795
[10]	train-merror:0.013764	eval-merror:0.16046
[11]	train-merror:0.01282	eval-merror:0.157062
[12]	train-merror:0.011876	eval-merror:0.151762
[13]	train-merror:0.010931	eval-merror:0.145369
[14]	train-merror:0.009959	eval-merror:0.145346
[15]	train-merror:0.008986	eval-merror:0.145132
[16]	train-merror:0.008299	eval-merror:0.142708
[17]	train-merror:0.007612	eval-merror:0.143492
[18]	t

In [96]:
# Average across results

# Mean eval error per boosting round over all folds. zip(*...) stops at the
# shortest fold, so only rounds that every fold reached are averaged. This works
# for any number of folds, not just two.
init_cs_results_avg = [sum(fold_errors) / len(fold_errors) for fold_errors in zip(*init_cs_results)]

# Replay early stopping on the averaged curve with the same patience that was
# given to xgb.train (EARLYSTOPPING_ROUNDS) instead of a hard-coded 30.
init_cs_min_error = init_cs_results_avg[0]
init_cs_best_iter = 0
init_cs_early_stopping = 0  # consecutive rounds without a new best error
for i, error in enumerate(init_cs_results_avg):
    if error <= init_cs_min_error:
        init_cs_min_error = error
        init_cs_best_iter = i
        # Reset to 0 (not 1): the best round itself is not a non-improving round,
        # otherwise the loop would give up one round early.
        init_cs_early_stopping = 0
    else:
        init_cs_early_stopping += 1
    if init_cs_early_stopping >= EARLYSTOPPING_ROUNDS:
        break

In [97]:
# Train Booster
# Fit on the full dataset (X_all). init_cs_best_iter is a zero-based round index,
# hence the +1 to turn it into a number of boosting rounds.

init_cs_booster = xgb.train(params=PARAMS, dtrain=X_all, num_boost_round=init_cs_best_iter+1)

In [99]:
# Save model
# The path is relative, so the file lands in the notebook's current working directory.

init_cs_booster.save_model("init_cs.model")

In [104]:
# NOTE: X_all is the same data init_cs_booster was trained on, so this is the
# training-set accuracy, not a held-out estimate of generalization.
y_predicted = init_cs_booster.predict(X_all)
accuracy = metrics.accuracy_score(y, y_predicted)
accuracy

0.9984420440382219