In [1]:
import xgboost as xgb
from sklearn import model_selection
from sklearn import metrics

import numpy as np
import pandas as pd
import os
import pickle
import itertools

# PATHS

In [2]:
# Root folder that is scanned for numbered dataset sub-folders
DATA_DIRECTORY = "../data/topviewkinect/"

# File-name prefix of the aggregated artefacts
DATA_ALL = "all"

# Folder that receives the aggregated files
PREPROCESSED_DIRECTORY = "{root}all/".format(root=DATA_DIRECTORY)

# PREPROCESSING

### Sanity check

In [29]:
# Print a warning for every numbered dataset whose labels contain activity -1
# or whose labels / features contain skeleton_id 1.
numeric_dataset_ids = [name for name in next(os.walk(DATA_DIRECTORY))[1] if name.isdigit()]

for dataset_id in numeric_dataset_ids:
    features_csv = "{data_dir}/{dataset_id}/features.csv".format(data_dir=DATA_DIRECTORY, dataset_id=dataset_id)
    labels_csv = "{data_dir}/{dataset_id}/labels.csv".format(data_dir=DATA_DIRECTORY, dataset_id=dataset_id)
    features_df = pd.read_csv(features_csv)
    labels_df = pd.read_csv(labels_csv)

    # (value that should be absent, column values to search, message to print)
    checks = [
        (-1, labels_df["activity"].values, "missing labels"),
        (1, labels_df["skeleton_id"].values, "multiple people labels"),
        (1, features_df["skeleton_id"].values, "multiple people features"),
    ]
    for forbidden_value, column_values, problem in checks:
        if forbidden_value in column_values:
            print(dataset_id, problem)

print("Done!")

Done!


### All features and labels

In [3]:
# Columns removed from each per-dataset table before it is appended to the aggregated files
ignored_labels_cols = ["frame_id", "skeleton_id"]
ignored_features_cols = ignored_labels_cols + ["x", "y", "z"]

In [4]:
# Destination paths of the aggregated feature / label tables
_all_csv_fields = {"data_dir": PREPROCESSED_DIRECTORY, "data": DATA_ALL}
all_features_csv = "{data_dir}/{data}_features.csv".format(**_all_csv_fields)
all_labels_csv = "{data_dir}/{data}_labels.csv".format(**_all_csv_fields)

In [28]:
# Rebuild the aggregated feature / label CSVs from every numbered dataset folder.
# Mode "w" truncates the output of any previous run, and the context manager closes
# both files even if one dataset fails to load.
with open(all_features_csv, "w") as all_features_f, open(all_labels_csv, "w") as all_labels_f:
    header = True  # column names are written once, with the first dataset only

    for dataset_id in next(os.walk(DATA_DIRECTORY))[1]:
        if not dataset_id.isdigit():
            continue
        print(dataset_id, ", ", end="")

        features_csv = "{data_dir}/{dataset_id}/features.csv".format(data_dir=DATA_DIRECTORY, dataset_id=dataset_id)
        features_df = pd.read_csv(features_csv, low_memory=False)
        labels_csv = "{data_dir}/{dataset_id}/labels.csv".format(data_dir=DATA_DIRECTORY, dataset_id=dataset_id)
        labels_df = pd.read_csv(labels_csv)

        # Keep only tracking skeletons (skeleton_id 0) and drop frames labelled with activity 6
        labels_df = labels_df.loc[labels_df["skeleton_id"] == 0]
        labels_df = labels_df.loc[labels_df["activity"] != 6]
        frame_indices = labels_df["frame_id"].values

        # Keep only the frames that are present in both tables
        features_df = features_df.loc[features_df["frame_id"].isin(frame_indices)]
        labels_df = labels_df.loc[labels_df["frame_id"].isin(features_df["frame_id"].values)]

        # Features and labels are paired by row position once they are reloaded,
        # so a length mismatch here would silently mislabel samples.
        assert len(features_df) == len(labels_df), (
            "dataset {}: {} feature rows vs {} label rows".format(dataset_id, len(features_df), len(labels_df))
        )

        # Append features and labels
        features_df = features_df.drop(labels=ignored_features_cols, axis=1)
        features_df["subject"] = int(dataset_id)
        features_df = features_df.astype(np.float32)
        features_df.to_csv(all_features_f, header=header, index=False)

        labels_df = labels_df.drop(labels=ignored_labels_cols, axis=1)
        labels_df["subject"] = int(dataset_id)
        # np.int was only an alias of the builtin int and was removed in NumPy 1.24
        labels_df = labels_df.astype(int)
        labels_df.to_csv(all_labels_f, header=header, index=False)

        header = False

print("\nDone!")

1 , 10 , 11 , 12 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 
Done!


### Data overview

In [5]:
# Reload the aggregated tables from disk
all_features_df, all_labels_df = [
    pd.read_csv(csv_path) for csv_path in (all_features_csv, all_labels_csv)
]

In [6]:
all_features_df.shape

(77024, 73)

In [7]:
all_features_df.head()

Unnamed: 0,layer_area_0,layer_area_1,layer_area_2,layer_contours_0,layer_contours_1,layer_distance_0,layer_distance_1,layer_distance_2,layer_distance_3,layer_distance_4,...,interlayer_pos_16,interlayer_pos_17,extremities0,extreme_infrared_0,extreme_infrared_1,extreme_infrared_2,extreme_infrared_3,extreme_infrared_4,extreme_infrared_5,subject
0,0.297578,0.411765,0.290657,3.0,3.0,16.5529,26.6833,26.019199,26.6833,201.0,...,-26.0,-107.0,4.0,0.0,10.0,11.5,11.5,0.0,11.5,1.0
1,0.310345,0.419238,0.270417,3.0,3.0,16.401199,26.4764,26.019199,26.4764,191.5,...,-26.0,-105.0,5.0,0.5,9.0,11.0,1.0,0.5,11.0,1.0
2,0.318015,0.386029,0.295956,3.0,3.0,16.1245,26.2488,27.018499,26.2488,174.5,...,-26.0,-104.0,5.0,0.0,12.5,4.5,4.5,0.5,13.0,1.0
3,0.348399,0.384181,0.26742,3.0,3.0,16.401199,26.419701,26.4764,26.419701,164.0,...,-25.0,-103.0,5.0,0.0,6.0,4.5,0.0,0.0,7.0,1.0
4,0.356383,0.370567,0.27305,3.0,3.0,17.719999,27.459101,27.459101,27.459101,164.5,...,-26.0,-107.0,3.0,0.0,0.0,0.5,0.0,0.0,0.5,1.0


In [8]:
all_labels_df.shape

(77024, 4)

In [9]:
all_labels_df.head()

Unnamed: 0,activity,orientation,orientation_accurate,subject
0,0,130,-1,1
1,0,130,-1,1
2,0,120,-1,1
3,0,130,-1,1
4,0,150,-1,1


In [10]:
# Sorted distinct subject ids found in the labels table
subjects_list = np.sort(all_labels_df["subject"].unique())
subjects_list

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12], dtype=int64)

In [11]:
# Sorted distinct activity ids found in the labels table
activities_list = np.sort(all_labels_df["activity"].unique())
activities_list

array([0, 1, 2, 3, 4, 5], dtype=int64)

In [12]:
# Model inputs: every feature column except the subject id
X = all_features_df.drop(labels="subject", axis="columns")
# Targets: the activity column, kept as a one-column frame
y = all_labels_df.loc[:, ["activity"]]
# DMatrix over the complete data set
X_all = xgb.DMatrix(data=X, label=y)

###  Data split

#### Initial Cross-Subject Test (2-fold)

In [34]:
# Two folds: odd-numbered subjects train / even test, then the roles are swapped
initial_cs_odd = [subject for subject in subjects_list if subject % 2]
initial_cs_even = [subject for subject in subjects_list if not subject % 2]
initial_cs_split = [
    {"train": train_subjects, "test": test_subjects}
    for train_subjects, test_subjects in (
        (initial_cs_odd, initial_cs_even),
        (initial_cs_even, initial_cs_odd),
    )
]

In [13]:
# Pickle file that stores the odd/even cross-subject split
initial_cs_split_fn = "{data_dir}/{data}_split_init_cs.pickle".format(data_dir=PREPROCESSED_DIRECTORY, data=DATA_ALL)

In [36]:
# Persist the split so the training section can reload it
with open(initial_cs_split_fn, mode="wb") as split_file:
    pickle.dump(initial_cs_split, split_file)

#### Complete Cross-Subject Test (2-fold)

In [22]:
# Every way of choosing half of the subjects for training; the other half is the test set
half_of_subjects = len(subjects_list) // 2
all_subjects = set(subjects_list)
complete_cs_train_indices = list(itertools.combinations(subjects_list, half_of_subjects))

complete_cs_split = []
for train_indices in complete_cs_train_indices:
    held_out_subjects = all_subjects - set(train_indices)
    complete_cs_split.append({"train": list(train_indices), "test": list(held_out_subjects)})

In [23]:
# Pickle file that stores the complete (all half/half combinations) cross-subject split
complete_cs_split_fn = "{data_dir}/{data}_split_complete_cs.pickle".format(data_dir=PREPROCESSED_DIRECTORY, data=DATA_ALL)

In [24]:
# Persist the split so the training section can reload it
with open(complete_cs_split_fn, mode="wb") as split_file:
    pickle.dump(complete_cs_split, split_file)

#### N-Subject-Fold (12-fold)

In [27]:
# Leave-one-subject-out: each subject forms the test set exactly once
n_subject_split = []
for test_idx in subjects_list:
    remaining_subjects = set(subjects_list) - {test_idx}
    n_subject_split.append({"train": list(remaining_subjects), "test": [test_idx]})

In [28]:
# Pickle file that stores the leave-one-subject-out split
n_subject_split_fn = "{data_dir}/{data}_split_n_subject.pickle".format(data_dir=PREPROCESSED_DIRECTORY, data=DATA_ALL)

In [29]:
# Persist the split to disk
with open(n_subject_split_fn, mode="wb") as split_file:
    pickle.dump(n_subject_split, split_file)

# TRAINING

#### Initial Cross-Subject Test

In [14]:
# Reload the odd/even split written in the preprocessing section.
# NOTE: pickle.load executes arbitrary code from the file — only load files this notebook produced.
with open(initial_cs_split_fn, "rb") as f:
    initial_cs_split = pickle.load(f)

In [16]:
# Tune parameters

# Base estimator; the search only varies the knobs listed below
xgboost_clf = xgb.XGBClassifier(objective="multi:softmax", learning_rate=0.3, n_estimators=100, seed=42)

# Candidate values sampled by the randomized search
max_depth = [5, 6, 7, 8]
gamma = [1, 2, 3]
reg_lambda = [1, 2, 3]
reg_alpha = [1, 2, 3]
subsample = [0.5, 0.8, 1]
colsample_bytree = [0.5, 0.8, 1]
colsample_bylevel = [0.5, 0.8, 1]

xgboost_knobs = dict(
    max_depth=max_depth,
    gamma=gamma,
    reg_lambda=reg_lambda,
    reg_alpha=reg_alpha,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    colsample_bylevel=colsample_bylevel,
)


def _subject_rows(subjects):
    """Return the index labels of all_labels_df rows whose subject is in `subjects`."""
    return all_labels_df[all_labels_df["subject"].isin(subjects)].index.tolist()


# One (train rows, test rows) pair per cross-subject fold, used as the CV splitter
initial_cs_train_test_cv = [
    (_subject_rows(fold["train"]), _subject_rows(fold["test"])) for fold in initial_cs_split
]

initial_cs_params_search = model_selection.RandomizedSearchCV(
    xgboost_clf,
    param_distributions=xgboost_knobs,
    cv=initial_cs_train_test_cv,
    n_iter=50,
    random_state=42,
    verbose=2,
    n_jobs=4,
)

In [17]:
initial_cs_params_search.fit(X.values, y.values.ravel())

Fitting 2 folds for each of 50 candidates, totalling 100 fits


[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed: 18.0min
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed: 51.2min finished


RandomizedSearchCV(cv=[([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, ... 77010, 77011, 77012, 77013, 77014, 77015, 77016, 77017, 77018, 77019, 77020, 77021, 77022, 77023])],
          error_score='raise',
          estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.3, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softmax', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=42, silent=True, subsample=1),
          fit_params={}, iid=True, n_iter=50, n_jobs=4,
          param_distributions={'max_depth': [5, 6, 7, 8], 'gamma': [1, 2, 3], 'reg_lambda': [1, 2, 3], 'reg_alpha': [1, 2, 3], 'subsample

In [18]:
initial_cs_params_search.best_params_

{'colsample_bylevel': 0.5,
 'colsample_bytree': 0.5,
 'gamma': 2,
 'max_depth': 6,
 'reg_alpha': 1,
 'reg_lambda': 3,
 'subsample': 0.5}

In [19]:
# Booster settings; the tuned values mirror the best_params_ reported by the randomized search
PARAMS = {
    "eta": 0.3,
    "max_depth": 6,
    "gamma": 2,
    "lambda": 3,
    "alpha": 1,
    "subsample": 0.5,
    "colsample_bytree": 0.5,
    "colsample_bylevel": 0.5,
    "objective": "multi:softmax",
    "eval_metric": "merror",
    "num_class": len(activities_list),
    "silent": 0,
}
# Upper bound on boosting rounds and the early-stopping patience
NUM_ROUNDS = 100
EARLYSTOPPING_ROUNDS = 30

In [20]:
# Find best iteration by early stopping

# One list of per-round eval errors per fold
init_cs_results = []

for train_test in initial_cs_split:
    subject_column = all_labels_df["subject"]
    train_indices = all_labels_df.index[subject_column.isin(train_test["train"]).values].tolist()
    test_indices = all_labels_df.index[subject_column.isin(train_test["test"]).values].tolist()

    X_train, y_train = X.iloc[train_indices], y.iloc[train_indices]
    X_test, y_test = X.iloc[test_indices], y.iloc[test_indices]

    train_dmatrix = xgb.DMatrix(X_train, y_train)
    test_dmatrix = xgb.DMatrix(X_test, y_test)
    watchlist = [(train_dmatrix, "train"), (test_dmatrix, "eval")]
    results = {}

    # Only the recorded error curve is needed here, so the returned booster is discarded
    xgb.train(
        params=PARAMS,
        dtrain=train_dmatrix,
        num_boost_round=NUM_ROUNDS,
        evals=watchlist,
        evals_result=results,
        early_stopping_rounds=EARLYSTOPPING_ROUNDS,
    )

    init_cs_results.append(results["eval"]["merror"])

[0]	train-merror:0.057176	eval-merror:0.270895
Multiple eval metrics have been passed: 'eval-merror' will be used for early stopping.

Will train until eval-merror hasn't improved in 30 rounds.
[1]	train-merror:0.038346	eval-merror:0.231588
[2]	train-merror:0.03122	eval-merror:0.221464
[3]	train-merror:0.029046	eval-merror:0.192756
[4]	train-merror:0.026499	eval-merror:0.185294
[5]	train-merror:0.024238	eval-merror:0.183203
[6]	train-merror:0.021691	eval-merror:0.172461
[7]	train-merror:0.02026	eval-merror:0.171416
[8]	train-merror:0.019516	eval-merror:0.16065
[9]	train-merror:0.017628	eval-merror:0.163716
[10]	train-merror:0.016912	eval-merror:0.159082
[11]	train-merror:0.015853	eval-merror:0.156111
[12]	train-merror:0.015081	eval-merror:0.148625
[13]	train-merror:0.01385	eval-merror:0.141591
[14]	train-merror:0.013507	eval-merror:0.1395
[15]	train-merror:0.013192	eval-merror:0.137883
[16]	train-merror:0.012648	eval-merror:0.131396
[17]	train-merror:0.012133	eval-merror:0.131515
[18]	

In [21]:
# Average across results

# Mean eval error per boosting round over all folds (any number of folds, not just two).
# zip() stops at the shortest fold, since early stopping can end folds after different
# numbers of rounds.
init_cs_results_avg = [sum(fold_errors) / len(fold_errors) for fold_errors in zip(*init_cs_results)]

if not init_cs_results_avg:
    raise ValueError("init_cs_results is empty; run the early-stopping cell first")

# Replay early stopping on the averaged curve, using the same patience as the per-fold runs
init_cs_min_error = init_cs_results_avg[0]
init_cs_best_iter = 0
init_cs_early_stopping = 0
for i, error in enumerate(init_cs_results_avg):
    if error <= init_cs_min_error:
        init_cs_min_error = error
        init_cs_best_iter = i + 1  # stored as a round count (1-based), later passed as num_boost_round
        init_cs_early_stopping = 1
    else:
        init_cs_early_stopping += 1
    if init_cs_early_stopping == EARLYSTOPPING_ROUNDS:
        break

In [22]:
# Train Booster

# Final model: fit on the complete data set (X_all) for init_cs_best_iter boosting rounds
init_cs_booster = xgb.train(params=PARAMS, dtrain=X_all, num_boost_round=init_cs_best_iter)

In [24]:
init_cs_best_iter

41

In [25]:
# Save model

# The relative path means the file is written to the notebook's current working directory
init_cs_booster.save_model("init_cs.model")

In [26]:
# Training-set accuracy
# NOTE: init_cs_booster was fit on X_all, so this score is measured on the same rows the model
# was trained on. It is not a held-out estimate; the cross-subject eval-merror curves are the
# generalisation measure.

y_predicted = init_cs_booster.predict(X_all)
accuracy = metrics.accuracy_score(y, y_predicted)
accuracy

0.99078209389281258

#### Complete Cross-Subject Test

In [25]:
# Reload the complete cross-subject split written in the preprocessing section.
# NOTE: pickle.load executes arbitrary code from the file — only load files this notebook produced.
with open(complete_cs_split_fn, "rb") as f:
    complete_cs_split = pickle.load(f)

In [40]:
# Save intermediate results

complete_cs_results_csv = "results/complete_cs.csv"

# Start from a file that holds only the header row; per-split rows are appended afterwards
complete_cs_results_pdf = pd.DataFrame(columns=["train_index", "tree", "error"])
with open(complete_cs_results_csv, "w") as f:
    complete_cs_results_pdf.to_csv(f, header=True, index=False)

In [41]:
# Find best iteration

# For each split: train for NUM_ROUNDS rounds, then append the per-round eval error to the results CSV
for train_index, train_test in enumerate(complete_cs_split):
    train_indices = all_labels_df[all_labels_df["subject"].isin(train_test["train"])].index.tolist()
    test_indices = all_labels_df[all_labels_df["subject"].isin(train_test["test"])].index.tolist()

    X_train = X.iloc[train_indices]
    X_test = X.iloc[test_indices]
    y_train = y.iloc[train_indices]
    y_test = y.iloc[test_indices]

    train_dmatrix = xgb.DMatrix(X_train, y_train)
    test_dmatrix = xgb.DMatrix(X_test, y_test)
    watchlist = [(train_dmatrix, "train"), (test_dmatrix, "eval")]
    results = {}

    complete_cs_booster = xgb.train(params=PARAMS, dtrain=train_dmatrix, num_boost_round=NUM_ROUNDS, evals=watchlist, evals_result=results)

    # Build the per-round table in one step instead of growing it row by row with .loc,
    # which is slow and coerces the integer columns to a common dtype.
    eval_errors = results["eval"]["merror"]
    results_df = pd.DataFrame(
        {
            "train_index": [train_index] * len(eval_errors),
            "tree": list(range(len(eval_errors))),
            "error": eval_errors,
        },
        columns=["train_index", "tree", "error"],
    )

    with open(complete_cs_results_csv, "a") as f:
        results_df.to_csv(f, header=False, index=False)

    # NOTE(review): stops after the first two splits — looks like a leftover debugging limit;
    # confirm before relying on complete_cs.csv as a full sweep.
    if train_index == 1:
        break

[0]	train-merror:0.04838	eval-merror:0.338531
[1]	train-merror:0.032147	eval-merror:0.310103
[2]	train-merror:0.025436	eval-merror:0.271463
[3]	train-merror:0.022592	eval-merror:0.259545
[4]	train-merror:0.021665	eval-merror:0.243407
[5]	train-merror:0.019525	eval-merror:0.235141
[6]	train-merror:0.01831	eval-merror:0.226285
[7]	train-merror:0.016968	eval-merror:0.225716
[8]	train-merror:0.015722	eval-merror:0.224185
[9]	train-merror:0.014699	eval-merror:0.225038
[10]	train-merror:0.013677	eval-merror:0.219287
[11]	train-merror:0.012686	eval-merror:0.212049
[12]	train-merror:0.011664	eval-merror:0.211284
[13]	train-merror:0.010897	eval-merror:0.205248
[14]	train-merror:0.010321	eval-merror:0.202887
[15]	train-merror:0.009459	eval-merror:0.20422
[16]	train-merror:0.008915	eval-merror:0.205904
[17]	train-merror:0.008244	eval-merror:0.201137
[18]	train-merror:0.007637	eval-merror:0.200197
[19]	train-merror:0.007062	eval-merror:0.199759
[20]	train-merror:0.006487	eval-merror:0.198579
[21]	