In [1]:
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
%matplotlib inline

  from pandas import MultiIndex, Int64Index


In [4]:
#Load csv
EXERCISE_DATA = pd.read_csv(r'DATASETS\EXCERCISE\EXERCISE_PATTERNS_DATASET.csv')

In [5]:
EXERCISE_DATA.head

<bound method NDFrame.head of      TotalSteps  TotalDistance  Calories  SCORE
0         13162       8.500000      1985      0
1         10735       6.970000      1797      0
2         10460       6.740000      1776      0
3          9762       6.280000      1745      0
4         12669       8.160000      1863      0
..          ...            ...       ...    ...
935       10686       8.110000      2847      0
936       20226      18.250000      3710      0
937       10733       8.150000      2832      0
938       21420      19.559999      3832      0
939        8064       6.120000      1849      0

[940 rows x 4 columns]>

In [6]:
#Create dictionary of ideal values for each score; calculate L2 norm to generate score)
IDEAL_VALS = {'TotalSteps': 10000, 'TotalDistance': 8, 'Calories': 450}
columns = EXERCISE_DATA.columns.values.tolist()
columns = columns[:4]
#Generate scores for each lever - compare iteratively and generate average score on its basis
def generate_score(row):
    #Iterate over each column except for gender
    print(" =================================== ")
    total_score = 0
    for column in columns:
        if column == "SCORE": continue
        print(row[column], IDEAL_VALS[column])
        column_score = abs(row[column] - IDEAL_VALS[column])
        print(column_score)
        total_score += column_score
    #Average out the score
    row["SCORE"] = total_score / len(columns)
    print('END SCORE: ', row["SCORE"])
    return row

EXERCISE_DATA = EXERCISE_DATA.apply(generate_score, axis = "columns")

13162.0 10000
3162.0
8.5 8
0.5
1985.0 450
1535.0
END SCORE:  1174.375
10735.0 10000
735.0
6.96999979 8
1.0300002099999999
1797.0 450
1347.0
END SCORE:  520.7575000525001
10460.0 10000
460.0
6.739999771 8
1.260000229
1776.0 450
1326.0
END SCORE:  446.81500005725
9762.0 10000
238.0
6.28000021 8
1.7199997900000001
1745.0 450
1295.0
END SCORE:  383.6799999475
12669.0 10000
2669.0
8.159999847 8
0.15999984699999992
1863.0 450
1413.0
END SCORE:  1020.53999996175
9705.0 10000
295.0
6.480000019 8
1.5199999809999998
1728.0 450
1278.0
END SCORE:  393.62999999525
13019.0 10000
3019.0
8.590000153 8
0.5900001530000001
1921.0 450
1471.0
END SCORE:  1122.64750003825
15506.0 10000
5506.0
9.880000114 8
1.8800001139999996
2035.0 450
1585.0
END SCORE:  1773.2200000285
10544.0 10000
544.0
6.679999828 8
1.3200001720000003
1786.0 450
1336.0
END SCORE:  470.33000004300004
9819.0 10000
181.0
6.340000153 8
1.659999847
1775.0 450
1325.0
END SCORE:  376.91499996175
12764.0 10000
2764.0
8.130000114 8
0.13000011399

In [8]:
print(EXERCISE_DATA)

     TotalSteps  TotalDistance  Calories      SCORE
0       13162.0       8.500000    1985.0  1174.3750
1       10735.0       6.970000    1797.0   520.7575
2       10460.0       6.740000    1776.0   446.8150
3        9762.0       6.280000    1745.0   383.6800
4       12669.0       8.160000    1863.0  1020.5400
..          ...            ...       ...        ...
935     10686.0       8.110000    2847.0   770.7775
936     20226.0      18.250000    3710.0  3374.0625
937     10733.0       8.150000    2832.0   778.7875
938     21420.0      19.559999    3832.0  3703.3900
939      8064.0       6.120000    1849.0   834.2200

[940 rows x 4 columns]


In [7]:
EXERCISE_DATA.to_pickle('EXERCISE_DATA.pkl')

In [9]:
#Reshape tensors for XGBOOST
import tensorflow as tf
from sklearn.model_selection import train_test_split

x_features = EXERCISE_DATA.iloc[:, :3]
print(x_features)
y_labels = EXERCISE_DATA["SCORE"]
train_x, valid_x, train_y, valid_y = train_test_split(x_features, y_labels, random_state = 2, shuffle = True)

train_x = tf.convert_to_tensor(train_x)
train_x = tf.reshape(train_x, [len(train_x), 3])

valid_x = tf.convert_to_tensor(valid_x)
valid_x = tf.reshape(valid_x, [len(valid_x), 3])

train_y = tf.convert_to_tensor(train_y)
train_y = tf.reshape(train_y, [len(train_y), 1])

valid_y = tf.convert_to_tensor(valid_y)
valid_y = tf.reshape(valid_y, [len(valid_y), 1])

     TotalSteps  TotalDistance  Calories
0       13162.0       8.500000    1985.0
1       10735.0       6.970000    1797.0
2       10460.0       6.740000    1776.0
3        9762.0       6.280000    1745.0
4       12669.0       8.160000    1863.0
..          ...            ...       ...
935     10686.0       8.110000    2847.0
936     20226.0      18.250000    3710.0
937     10733.0       8.150000    2832.0
938     21420.0      19.559999    3832.0
939      8064.0       6.120000    1849.0

[940 rows x 3 columns]


In [15]:
#TRAIN GRADIENT BOOSTED TREE
from sklearn.metrics import mean_squared_error, mean_absolute_error

#Convert to Dmatrix
TRAIN_DATA = xgb.DMatrix(train_x, train_y, feature_names = columns[1:15])
VALID_DATA = xgb.DMatrix(valid_x, valid_y, feature_names = columns[1:15])
#Parameters for boosted tree
XGBOOST_PARAMS = {"objective": "reg:squarederror", "subsample": 0.6,
                  "colsample_bytree" : 0.6, "learning_rate" : 0.1, "max_depth" : 100,
                  "alpha": 20, "n_estimators": 12}
#Train the xgboost model
XGB_EXERCISE_MODEL = xgb.train(XGBOOST_PARAMS, TRAIN_DATA, evals = [(TRAIN_DATA, "TRAIN_DATA"), (VALID_DATA, "VALID_DATA")],
                            num_boost_round = 200, early_stopping_rounds = 40)
predictions = XGB_EXERCISE_MODEL.predict(VALID_DATA)
error = mean_squared_error(valid_y, predictions)
print('ERROR: ', error)

Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	TRAIN_DATA-rmse:1580.39172	VALID_DATA-rmse:1725.05078
[1]	TRAIN_DATA-rmse:1451.95972	VALID_DATA-rmse:1600.12353
[2]	TRAIN_DATA-rmse:1313.94580	VALID_DATA-rmse:1455.08301
[3]	TRAIN_DATA-rmse:1214.57495	VALID_DATA-rmse:1356.76306
[4]	TRAIN_DATA-rmse:1100.05444	VALID_DATA-rmse:1233.14831
[5]	TRAIN_DATA-rmse:997.12744	VALID_DATA-rmse:1115.24060
[6]	TRAIN_DATA-rmse:926.65509	VALID_DATA-rmse:1047.81226
[7]	TRAIN_DATA-rmse:843.45557	VALID_DATA-rmse:959.66650
[8]	TRAIN_DATA-rmse:769.42969	VALID_DATA-rmse:881.08673
[9]	TRAIN_DATA-rmse:717.57855	VALID_DATA-rmse:830.34253
[10]	TRAIN_DATA-rmse:668.62183	VALID_DATA-rmse:784.49890
[11]	TRAIN_DATA-rmse:611.00458	VALID_DATA-rmse:719.3

In [16]:
#Save Model
XGB_EXERCISE_MODEL.save_model('METRICS_EXERCISE_GRADBOOSTED_MODELS[41443].model')

In [None]:
# #Print desicion tree -> transparency into exactly what the model is doing
ig, ax = plt.subplots(figsize=(30, 30))
xgb.plot_tree(XGB_SLEEP_MODEL, num_trees=10, ax=ax)
plt.show()