In [152]:
import xgboost as xgb
from sklearn import model_selection
from sklearn import metrics

import numpy as np
import pandas as pd
import os
import pickle
import itertools
import random

np.set_printoptions(precision=2, suppress=True)

# PATHS

In [6]:
DATA_DIRECTORY = "../data/topviewkinect/"

PREPROCESSED_DIRECTORY = DATA_DIRECTORY + "all/"

DATA_ALL = "all"

# PREPROCESSING

### Sanity check

In [29]:
for dataset_id in next(os.walk(DATA_DIRECTORY))[1]:
    if not dataset_id.isdigit():
        continue

    features_csv = "{data_dir}/{dataset_id}/features.csv".format(data_dir=DATA_DIRECTORY, dataset_id=dataset_id)
    features_df = pd.read_csv(features_csv)
    labels_csv = "{data_dir}/{dataset_id}/labels.csv".format(data_dir=DATA_DIRECTORY, dataset_id=dataset_id)
    labels_df = pd.read_csv(labels_csv)
    
    if -1 in labels_df["activity"].values:
        print(dataset_id, "missing labels")
    
    if 1 in labels_df["skeleton_id"].values:
        print(dataset_id, "multiple people labels")

    if 1 in features_df["skeleton_id"].values:
        print(dataset_id, "multiple people features")

print("Done!")

Done!


### All features and labels

In [7]:
ignored_features_cols = ["frame_id", "skeleton_id", "x", "y", "z"]
ignored_labels_cols = ["frame_id", "skeleton_id"]

In [8]:
all_features_csv = "{data_dir}/{data}_features.csv".format(data_dir=PREPROCESSED_DIRECTORY, data=DATA_ALL)
all_labels_csv = "{data_dir}/{data}_labels.csv".format(data_dir=PREPROCESSED_DIRECTORY, data=DATA_ALL)

In [9]:
# Overwrite previous data files
open(all_features_csv, "w").close()
open(all_labels_csv, "w").close()

# Open data files
all_features_f = open(all_features_csv, "a")
all_labels_f = open(all_labels_csv, "a")
header=True

for dataset_id in next(os.walk(DATA_DIRECTORY))[1]:
    if not dataset_id.isdigit():
        continue
    else:
        print(dataset_id, ", ", end="")

    features_csv = "{data_dir}/{dataset_id}/features.csv".format(data_dir=DATA_DIRECTORY, dataset_id=dataset_id)
    features_df = pd.read_csv(features_csv, low_memory=False)
    labels_csv = "{data_dir}/{dataset_id}/labels.csv".format(data_dir=DATA_DIRECTORY, dataset_id=dataset_id)
    labels_df = pd.read_csv(labels_csv)

    # Keep only tracking skeletons
    labels_df = labels_df.loc[labels_df["skeleton_id"] == 0]
    labels_df = labels_df.loc[labels_df["activity"] != 6]
    frame_indices = labels_df["frame_id"].values
    
    features_df = features_df.loc[features_df["frame_id"].isin(frame_indices)]
    labels_df = labels_df.loc[labels_df["frame_id"].isin(features_df["frame_id"].values)]
    
    # Append features and labels
    features_df = features_df.drop(labels=ignored_features_cols, axis=1)
    features_df["subject"] = int(dataset_id)
    features_df = features_df.astype(np.float32)
    features_df.to_csv(all_features_f, header=header, index=False)
    
    labels_df = labels_df.drop(labels=ignored_labels_cols, axis=1)
    labels_df["subject"] = int(dataset_id)
    labels_df = labels_df.astype(np.int)
    labels_df.to_csv(all_labels_f, header=header, index=False)
    
    header = False
    
all_features_f.close()
all_labels_f.close()

print("\nDone!")

1 , 10 , 11 , 12 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 
Done!


### Data overview

In [10]:
all_features_df = pd.read_csv(all_features_csv)
all_labels_df = pd.read_csv(all_labels_csv)

In [11]:
all_features_df.shape

(77024, 73)

In [12]:
all_features_df.head()

Unnamed: 0,layer_area_0,layer_area_1,layer_area_2,layer_contours_0,layer_contours_1,layer_distance_0,layer_distance_1,layer_distance_2,layer_distance_3,layer_distance_4,...,interlayer_pos_16,interlayer_pos_17,extremities0,extreme_infrared_0,extreme_infrared_1,extreme_infrared_2,extreme_infrared_3,extreme_infrared_4,extreme_infrared_5,subject
0,0.297578,0.411765,0.290657,3.0,3.0,16.5529,26.6833,26.019199,26.6833,201.0,...,-26.0,-107.0,4.0,0.0,10.0,11.5,11.5,0.0,11.5,1.0
1,0.310345,0.419238,0.270417,3.0,3.0,16.401199,26.4764,26.019199,26.4764,191.5,...,-26.0,-105.0,5.0,0.5,9.0,11.0,1.0,0.5,11.0,1.0
2,0.318015,0.386029,0.295956,3.0,3.0,16.1245,26.2488,27.018499,26.2488,174.5,...,-26.0,-104.0,5.0,0.0,12.5,4.5,4.5,0.5,13.0,1.0
3,0.348399,0.384181,0.26742,3.0,3.0,16.401199,26.419701,26.4764,26.419701,164.0,...,-25.0,-103.0,5.0,0.0,6.0,4.5,0.0,0.0,7.0,1.0
4,0.356383,0.370567,0.27305,3.0,3.0,17.719999,27.459101,27.459101,27.459101,164.5,...,-26.0,-107.0,3.0,0.0,0.0,0.5,0.0,0.0,0.5,1.0


In [13]:
all_labels_df.shape

(77024, 4)

In [14]:
all_labels_df.head()

Unnamed: 0,activity,orientation,orientation_accurate,subject
0,0,130,-1,1
1,0,130,-1,1
2,0,120,-1,1
3,0,130,-1,1
4,0,150,-1,1


In [15]:
subjects_list = np.unique(all_labels_df["subject"])
subjects_list

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12], dtype=int64)

In [16]:
activities_list = np.unique(all_labels_df["activity"])
activities_list

array([0, 1, 2, 3, 4, 5], dtype=int64)

In [141]:
X = all_features_df.drop(labels="subject", axis=1).values
y = all_labels_df["activity"].values.ravel()
all_dmatrix = xgb.DMatrix(X, y)

###  Data split

#### Initial Cross-Subject Test

In [25]:
# odd-numbered subjects as training subject

initial_cs_odd = [subject for subject in subjects_list if subject % 2 == 1]
initial_cs_even = [subject for subject in subjects_list if subject % 2 == 0]
initial_cs_split = [
    {"train": initial_cs_odd, "test": initial_cs_even}
]

In [26]:
INITIAL_CS_SPLIT_FN = "{data_dir}/{data}_split_init_cs.pickle".format(data_dir=PREPROCESSED_DIRECTORY, data=DATA_ALL)

In [27]:
with open(INITIAL_CS_SPLIT_FN, "wb") as f:
    pickle.dump(initial_cs_split, f)

#### Complete Cross-Subject Test

In [22]:
complete_cs_train_indices = list(itertools.combinations(subjects_list, int(len(subjects_list)/2)))
complete_cs_split = [
    {
        "train": list(train_indices),
        "test": list(set(subjects_list) - set(train_indices))
    } for train_indices in complete_cs_train_indices
]

In [13]:
complete_cs_split_fn = "{data_dir}/{data}_split_complete_cs.pickle".format(data_dir=PREPROCESSED_DIRECTORY, data=DATA_ALL)

In [24]:
with open(complete_cs_split_fn, "wb") as f:
    pickle.dump(complete_cs_split, f)

#### N-Subject-Fold (10-fold)

In [27]:
n_subject_split = [
    {
        "train": list(set(subjects_list) - {test_idx}),
        "test": [test_idx]
    } for test_idx in subjects_list
]

In [28]:
n_subject_split_fn = "{data_dir}/{data}_split_n_subject.pickle".format(data_dir=PREPROCESSED_DIRECTORY, data=DATA_ALL)

In [29]:
with open(n_subject_split_fn, "wb") as f:
    pickle.dump(n_subject_split, f)

# TRAINING

#### Initial Cross-Subject Test

In [29]:
with open(INITIAL_CS_SPLIT_FN, "rb") as f:
    initial_cs_split = pickle.load(f)

In [30]:
initial_cs_split

[{'test': [2, 4, 6, 8, 10, 12], 'train': [1, 3, 5, 7, 9, 11]}]

In [118]:
initial_cs_X_train_df = all_features_df[all_features_df["subject"].isin(initial_cs_split[0]["train"])].reset_index()
initial_cs_y_train_df = all_labels_df[all_labels_df["subject"].isin(initial_cs_split[0]["train"])].reset_index()

initial_cs_X_test_df = all_features_df[all_features_df["subject"].isin(initial_cs_split[0]["test"])].reset_index()
initial_cs_y_test_df = all_labels_df[all_labels_df["subject"].isin(initial_cs_split[0]["test"])].reset_index()

In [121]:
# Cross Validation - tune parameters

xgboost_clf = xgb.XGBClassifier(learning_rate=0.3, n_estimators=100, objective="multi:softmax", seed=42)

xgboost_knobs = {
    "max_depth": [5, 6, 7, 8],
    "gamma": [1, 2, 3],
    "reg_lambda": [1, 2, 3],
    "reg_alpha": [1, 2, 3],
    "subsample": [0.5, 0.8, 1],
    "colsample_bytree": [0.5, 0.8, 1],
    "colsample_bylevel": [0.5, 0.8, 1]
}

# 5-fold-subject CV 
initial_cs_train_val_cv = []
for subject_idx in initial_cs_split[0]["train"]:
    train_indices = initial_cs_y_train_df[initial_cs_y_train_df["subject"] != subject_idx].index.tolist()
    validation_indices = initial_cs_y_train_df[initial_cs_y_train_df["subject"] == subject_idx].index.tolist()
    initial_cs_train_val_cv.append((train_indices, validation_indices))

initial_cs_params_search = model_selection.RandomizedSearchCV(
    xgboost_clf, param_distributions=xgboost_knobs, cv=initial_cs_train_val_cv,
    n_iter=50, random_state=42, verbose=2, n_jobs=4
)

In [119]:
# Get X and y values

initial_cs_X_train = initial_cs_X_train_df.drop(labels="subject", axis=1).values
initial_cs_y_train = initial_cs_y_train_df["activity"].values.ravel()

initial_cs_X_test = initial_cs_X_test_df.drop(labels="subject", axis=1).values
initial_cs_y_test = initial_cs_y_test_df["activity"].values.ravel()

In [107]:
initial_cs_params_search.fit(initial_cs_X_train, initial_cs_y_train)

Fitting 6 folds for each of 50 candidates, totalling 300 fits


[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed: 12.5min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed: 58.8min
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed: 110.2min finished


RandomizedSearchCV(cv=[([5589, 5590, 5591, 5592, 5593, 5594, 5595, 5596, 5597, 5598, 5599, 5600, 5601, 5602, 5603, 5604, 5605, 5606, 5607, 5608, 5609, 5610, 5611, 5612, 5613, 5614, 5615, 5616, 5617, 5618, 5619, 5620, 5621, 5622, 5623, 5624, 5625, 5626, 5627, 5628, 5629, 5630, 5631, 5632, 5633, 5634, 5635, 5636, 5637, ... 15105, 15106, 15107, 15108, 15109, 15110, 15111, 15112, 15113, 15114, 15115, 15116, 15117, 15118])],
          error_score='raise',
          estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.3, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softmax', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=42, silent=True, subsample=1),
          fit_params={}, iid=True, n_iter=50, n_jobs=4,
          param_distributions={'max_depth': [5, 6, 7, 8], 'gamma': [1, 2, 3], 'reg_lambda': [1, 2, 3], 'reg_alpha': [1, 2, 3], 'subsample

In [110]:
with open("results/init_cs_params", "wb") as f:
    pickle.dump(initial_cs_params_search, f)

In [109]:
initial_cs_params_search.best_params_

{'colsample_bylevel': 0.5,
 'colsample_bytree': 0.5,
 'gamma': 2,
 'max_depth': 8,
 'reg_alpha': 1,
 'reg_lambda': 3,
 'subsample': 0.8}

In [144]:
INITIAL_CS_PARAMS = {
    "eta": 0.3,
    "max_depth": 8,
    "gamma": 2,
    "lambda": 3,
    "alpha": 1,
    "subsample": 0.8,
    "colsample_bytree": 0.5,
    "colsample_bylevel": 0.5,
    "objective": "multi:softmax",
    "eval_metric": "merror",
    "num_class": len(activities_list),
    "silent": 0
}
NUM_ROUNDS = 100
NUM_EARLYSTOPPING_ROUNDS = 30

In [117]:
x = np.array([[1,1,1],[2,2,2],[3,3,3]])


array([[1, 1, 1],
       [3, 3, 3]])

In [128]:
# Find best number of iterations(boosting rounds)

initial_cs_boosting_errors = []

for cv_index, (train_indices, validation_indices) in enumerate(initial_cs_train_val_cv):
    X_train = np.take(initial_cs_X_train, train_indices, axis=0)
    y_train = np.take(initial_cs_y_train, train_indices, axis=0)
    X_validation = np.take(initial_cs_X_train, validation_indices, axis=0)
    y_validation = np.take(initial_cs_y_train, validation_indices, axis=0)
    
    train_dmatrix = xgb.DMatrix(X_train, y_train)
    validation_dmatrix = xgb.DMatrix(X_validation, y_validation)
    watchlist = [(train_dmatrix, "train"), (validation_dmatrix, "eval")]
    results = {}
    
    xgb.train(params=INITIAL_CS_PARAMS, dtrain=train_dmatrix, evals=watchlist, evals_result=results,
                 num_boost_round=NUM_ROUNDS, verbose_eval=NUM_ROUNDS/2)
    initial_cs_boosting_errors.append(results["eval"]["merror"])
    print("CV - finding best number of boosting rounds", cv_index)

0
1
2
3
4
5


In [136]:
# Average across results

initial_cs_boosting_errors_avg = [sum(errors) / len(initial_cs_train_val_cv) for errors in zip(*initial_cs_boosting_errors)]
initial_cs_boosting_errors_avg

initial_cs_min_error = initial_cs_boosting_errors_avg[0]
initial_cs_num_rounds = 0
initial_cs_early_stopping = 0
for boosting_round, error in enumerate(initial_cs_boosting_errors_avg):
    if error <= initial_cs_min_error:
        initial_cs_min_error = error
        initial_cs_num_rounds = boosting_round + 1
        initial_cs_early_stopping = 1
    else:
        initial_cs_early_stopping += 1
    if initial_cs_early_stopping == 30:
        break

In [137]:
initial_cs_num_rounds

18

In [139]:
# Testing set accuracy

initial_cs_train_dmatrix = xgb.DMatrix(initial_cs_X_train, initial_cs_y_train)
initial_cs_test_dmatrix = xgb.DMatrix(initial_cs_X_test, initial_cs_y_test)

initial_cs_booster = xgb.train(params=INITIAL_CS_PARAMS, dtrain=initial_cs_train_dmatrix,
                               um_boost_round=initial_cs_num_rounds)

initial_cs_y_predicted = initial_cs_booster.predict(initial_cs_test_dmatrix)

0.861427315288


In [153]:
initial_cs_accuracy = metrics.accuracy_score(initial_cs_y_test, initial_cs_y_predicted)
print(initial_cs_accuracy)

initial_cs_cm = metrics.confusion_matrix(initial_cs_y_test, initial_cs_y_predicted)
initial_cs_cm = initial_cs_cm.astype("float") / initial_cs_cm.sum(axis=1)[:, np.newaxis]
initial_cs_cm *= 100
print(initial_cs_cm)

0.861427315288
[[ 96.88   0.04   0.     1.4    1.4    0.28]
 [  0.68  97.85   0.47   0.     0.99   0.01]
 [  0.42   6.26  93.2    0.     0.02   0.1 ]
 [ 39.52   0.02   0.    56.06   3.79   0.61]
 [  2.85   0.05   0.     7.33  87.47   2.3 ]
 [ 18.74   1.19   0.07   1.68   0.36  77.97]]


In [145]:
# Train final Booster

initial_cs_booster = xgb.train(params=INITIAL_CS_PARAMS, dtrain=X_all, num_boost_round=initial_cs_num_rounds)

In [146]:
# Save model

initial_cs_booster.save_model("initial_cs.model")

#### Complete Cross-Subject Test

In [17]:
with open(complete_cs_split_fn, "rb") as f:
    complete_cs_split = pickle.load(f)

In [37]:
# Tune parameters

xgboost_clf = xgb.XGBClassifier(learning_rate=0.3, n_estimators=100, objective="multi:softmax", seed=42)

xgboost_knobs = {
    "max_depth": [5, 6, 7, 8],
    "gamma": [1, 2, 3],
    "reg_lambda": [1, 2, 3],
    "reg_alpha": [1, 2, 3],
    "subsample": [0.5, 0.8, 1],
    "colsample_bytree": [0.5, 0.8, 1],
    "colsample_bylevel": [0.5, 0.8, 1]
}

complete_cs_train_test_cv = []
for train_test in complete_cs_split:
    train_indices = all_labels_df[all_labels_df["subject"].isin(train_test["train"])].index.tolist()
    test_indices = all_labels_df[all_labels_df["subject"].isin(train_test["test"])].index.tolist()
    complete_cs_train_test_cv.append((train_indices, test_indices))

# Get 100 random splits
random.shuffle(complete_cs_train_test_cv)
complete_cs_train_test_cv = complete_cs_train_test_cv[0:100]

In [35]:
complete_cs_params_search = model_selection.RandomizedSearchCV(
    xgboost_clf, param_distributions=xgboost_knobs, cv=complete_cs_train_test_cv, n_iter=20, random_state=42, verbose=2, n_jobs=4)

In [38]:
complete_cs_params_search.fit(X.values, y.values.ravel())

Fitting 100 folds for each of 20 candidates, totalling 2000 fits


[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed: 18.5min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed: 76.3min
[Parallel(n_jobs=4)]: Done 357 tasks      | elapsed: 154.5min
[Parallel(n_jobs=4)]: Done 640 tasks      | elapsed: 268.7min
[Parallel(n_jobs=4)]: Done 1005 tasks      | elapsed: 415.8min
[Parallel(n_jobs=4)]: Done 1450 tasks      | elapsed: 618.9min
[Parallel(n_jobs=4)]: Done 1977 tasks      | elapsed: 887.9min
[Parallel(n_jobs=4)]: Done 2000 out of 2000 | elapsed: 899.8min finished


RandomizedSearchCV(cv=[([16318, 16319, 16320, 16321, 16322, 16323, 16324, 16325, 16326, 16327, 16328, 16329, 16330, 16331, 16332, 16333, 16334, 16335, 16336, 16337, 16338, 16339, 16340, 16341, 16342, 16343, 16344, 16345, 16346, 16347, 16348, 16349, 16350, 16351, 16352, 16353, 16354, 16355, 16356, 16357, 16358, 16359, ... 72277, 72278, 72279, 72280, 72281, 72282, 72283, 72284, 72285, 72286, 72287, 72288, 72289, 72290])],
          error_score='raise',
          estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.3, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softmax', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=42, silent=True, subsample=1),
          fit_params={}, iid=True, n_iter=20, n_jobs=4,
          param_distributions={'max_depth': [5, 6, 7, 8], 'gamma': [1, 2, 3], 'reg_lambda': [1, 2, 3], 'reg_alpha': [1, 2, 3], 'subsample

In [39]:
with open("results/complete_cs_params", "wb") as f:
    pickle.dump(complete_cs_params_search, f)

In [40]:
complete_cs_params_search.best_params_

{'colsample_bylevel': 0.5,
 'colsample_bytree': 0.8,
 'gamma': 1,
 'max_depth': 5,
 'reg_alpha': 1,
 'reg_lambda': 3,
 'subsample': 0.5}

In [15]:
with open("results/complete_cs_params", "rb") as f:
    complete_cs_params_search = pickle.load(f)

In [25]:
COMPLETE_CS_PARAMS = {}
COMPLETE_CS_PARAMS["eta"] = 0.3
COMPLETE_CS_PARAMS["max_depth"] = 5
COMPLETE_CS_PARAMS["gamma"] = 1
COMPLETE_CS_PARAMS["lambda"] = 3
COMPLETE_CS_PARAMS["alpha"] = 1
COMPLETE_CS_PARAMS["subsample"] = 0.5
COMPLETE_CS_PARAMS["colsample_bytree"] = 0.8
COMPLETE_CS_PARAMS["colsample_bylevel"] = 0.5
COMPLETE_CS_PARAMS["objective"] = "multi:softmax"
COMPLETE_CS_PARAMS["eval_metric"] = "merror"
COMPLETE_CS_PARAMS["num_class"] = len(activities_list)
COMPLETE_CS_PARAMS["silent"] = 1
NUM_ROUNDS = 100
EARLYSTOPPING_ROUNDS = 30

In [29]:
# Find best iteration by early stopping

complete_cs_results = []

for train_test_idx, train_test in enumerate(complete_cs_split):
    train_indices = all_labels_df[all_labels_df["subject"].isin(train_test["train"])].index.tolist()
    test_indices = all_labels_df[all_labels_df["subject"].isin(train_test["test"])].index.tolist()
    
    X_train = X.iloc[train_indices]
    X_test = X.iloc[test_indices]
    y_train = y.iloc[train_indices]
    y_test = y.iloc[test_indices]
    
    train_dmatrix = xgb.DMatrix(X_train, y_train)
    test_dmatrix = xgb.DMatrix(X_test, y_test)
    watchlist = [(train_dmatrix, "train"), (test_dmatrix, "eval")]
    results = {}
    
    booster = xgb.train(params=COMPLETE_CS_PARAMS, dtrain=train_dmatrix, num_boost_round=NUM_ROUNDS, 
                        evals=watchlist, evals_result=results, verbose_eval=50)
    
    complete_cs_results.append(results["eval"]["merror"])
    print(train_test_idx)

[0]	train-merror:0.054004	eval-merror:0.308463
[50]	train-merror:0.004218	eval-merror:0.186267
0
[0]	train-merror:0.048274	eval-merror:0.335839
[50]	train-merror:0.004711	eval-merror:0.198976
1
[0]	train-merror:0.054557	eval-merror:0.376124
[50]	train-merror:0.004503	eval-merror:0.171953
2
[0]	train-merror:0.040945	eval-merror:0.347736
[50]	train-merror:0.004188	eval-merror:0.228316
3
[0]	train-merror:0.048801	eval-merror:0.317888
[50]	train-merror:0.004767	eval-merror:0.238868
4
[0]	train-merror:0.064037	eval-merror:0.280673
[50]	train-merror:0.004443	eval-merror:0.163558
5
[0]	train-merror:0.061728	eval-merror:0.262622
[50]	train-merror:0.004527	eval-merror:0.157888
6
[0]	train-merror:0.051313	eval-merror:0.285424
[50]	train-merror:0.004191	eval-merror:0.134375
7
[0]	train-merror:0.052578	eval-merror:0.280758
[50]	train-merror:0.00426	eval-merror:0.124918
8
[0]	train-merror:0.040972	eval-merror:0.273129
[50]	train-merror:0.003907	eval-merror:0.18175
9
[0]	train-merror:0.057028	eval-m

In [30]:
with open("results/complete_cs_trees", "wb") as f:
    pickle.dump(complete_cs_results, f)

In [40]:
# Save intermediate results

complete_cs_results_csv = "results/complete_cs.csv"
open(complete_cs_results_csv, "w").close()
with open(complete_cs_results_csv, "a") as f:
    complete_cs_results_pdf = pd.DataFrame(columns=["train_index", "tree", "error"])
    complete_cs_results_pdf.to_csv(f, header=True, index=False)

In [41]:
# Find best iteration

for train_index, train_test in enumerate(complete_cs_split):
    train_indices = all_labels_df[all_labels_df["subject"].isin(train_test["train"])].index.tolist()
    test_indices = all_labels_df[all_labels_df["subject"].isin(train_test["test"])].index.tolist()
    
    X_train = X.iloc[train_indices]
    X_test = X.iloc[test_indices]
    y_train = y.iloc[train_indices]
    y_test = y.iloc[test_indices]
    
    train_dmatrix = xgb.DMatrix(X_train, y_train)
    test_dmatrix = xgb.DMatrix(X_test, y_test)
    watchlist = [(train_dmatrix, "train"), (test_dmatrix, "eval")]
    results = {}
    
    complete_cs_booster = xgb.train(params=PARAMS, dtrain=train_dmatrix, num_boost_round=NUM_ROUNDS, evals=watchlist, evals_result=results)
    results_df = pd.DataFrame(columns=["train_index", "tree", "error"])
    for tree, error in enumerate(results["eval"]["merror"]):
        results_df.loc[tree] = [train_index, tree, error]
    
    with open(complete_cs_results_csv, "a") as f:
        results_df.to_csv(f, header=False, index=False)
        
    if train_index == 1:
        break

[0]	train-merror:0.04838	eval-merror:0.338531
[1]	train-merror:0.032147	eval-merror:0.310103
[2]	train-merror:0.025436	eval-merror:0.271463
[3]	train-merror:0.022592	eval-merror:0.259545
[4]	train-merror:0.021665	eval-merror:0.243407
[5]	train-merror:0.019525	eval-merror:0.235141
[6]	train-merror:0.01831	eval-merror:0.226285
[7]	train-merror:0.016968	eval-merror:0.225716
[8]	train-merror:0.015722	eval-merror:0.224185
[9]	train-merror:0.014699	eval-merror:0.225038
[10]	train-merror:0.013677	eval-merror:0.219287
[11]	train-merror:0.012686	eval-merror:0.212049
[12]	train-merror:0.011664	eval-merror:0.211284
[13]	train-merror:0.010897	eval-merror:0.205248
[14]	train-merror:0.010321	eval-merror:0.202887
[15]	train-merror:0.009459	eval-merror:0.20422
[16]	train-merror:0.008915	eval-merror:0.205904
[17]	train-merror:0.008244	eval-merror:0.201137
[18]	train-merror:0.007637	eval-merror:0.200197
[19]	train-merror:0.007062	eval-merror:0.199759
[20]	train-merror:0.006487	eval-merror:0.198579
[21]	