# Gradient Boosted Regression Trees
---

#### Data Classification Task - Model Training and Testing

### Imports
---

In [21]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt

from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import mean_squared_error, accuracy_score

from imblearn.combine import SMOTEENN

### Data Preprocessing
---
Where the ✨ magic ✨ happens

In [22]:
data = pd.read_pickle('../training_dataset_task3/task_3_training_e8da4715deef7d56_f8b7378_pandas.pkl')
df = data.loc[:,'pianist_id':'arousal']

In [23]:
# drop non-required features
X = df.drop(['quadrant', 'valence', 'arousal'], axis=1)
y = df['quadrant']

# normalize data
X_std = StandardScaler().fit_transform(X)
X = pd.DataFrame(X_std, columns=X.columns)
X = X.loc[:, "essentia_dissonance_mean":"mirtoolbox_roughness_pct_90"]

# shuffle data
X, y = shuffle(X, y , random_state=13)

# smote enn
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

# split data into training-/test-set
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [24]:
X_train

Unnamed: 0,essentia_dissonance_mean,essentia_dissonance_stdev,essentia_dynamic_complexity,essentia_loudness,essentia_onset_rate,essentia_pitch_salience_mean,essentia_pitch_salience_stdev,essentia_spectral_centroid_mean,essentia_spectral_centroid_stdev,essentia_spectral_complexity_mean,...,mirtoolbox_novelty_std,mirtoolbox_novelty_pct_10,mirtoolbox_novelty_pct_50,mirtoolbox_novelty_pct_90,mirtoolbox_pulseclarity,mirtoolbox_roughness_mean,mirtoolbox_roughness_std,mirtoolbox_roughness_pct_10,mirtoolbox_roughness_pct_50,mirtoolbox_roughness_pct_90
78,0.652131,-0.395405,-0.947066,0.617313,1.135133,0.411921,-1.141278,-0.052817,0.969386,-0.611714,...,-0.283257,-0.132523,0.128592,-0.066988,0.500024,0.502911,-0.520811,0.824101,0.485116,0.224715
29,0.270427,-0.674596,-0.673356,0.982513,-0.309247,0.018125,-0.097964,0.246241,-0.265885,-1.205762,...,-0.986785,-0.132523,-1.021422,-1.318619,-1.121356,-0.334504,-0.000192,-0.238357,-0.422630,-0.189874
280,-1.648462,-0.008504,-0.808687,-0.818066,-0.523434,-0.284987,-1.463189,-0.800746,0.198840,0.853732,...,0.891188,-0.132523,-0.331523,0.588847,-0.717868,0.145573,0.007299,0.115105,0.142528,0.045589
507,1.295009,-0.834411,-0.743406,2.526212,1.040166,0.884361,-1.278342,0.355783,-0.348396,0.482245,...,0.482468,-0.132523,0.815806,0.587096,0.252162,1.116314,-0.119981,1.321130,1.120894,0.876903
652,0.677167,0.071451,-0.139664,0.600671,0.065922,0.265566,-0.228369,-0.127591,-0.278460,-0.659449,...,-1.356160,-0.132523,-0.643391,-1.019313,2.646946,0.215632,0.050398,0.207223,0.225942,0.176425
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,0.582137,-0.013801,-0.213219,-0.488757,0.239327,0.004035,0.506741,-0.158201,-0.486893,0.544330,...,-0.211979,-0.132523,0.027178,0.212757,0.326150,0.257034,1.770699,-0.064601,0.033253,0.781597
270,-0.221779,0.842557,0.235892,0.081516,-1.041193,-0.252875,0.526253,-0.185498,-0.216726,-0.920352,...,0.570368,-0.132523,-0.527514,0.306412,-0.070935,-0.806342,-0.918741,-0.578591,-0.775367,-0.962225
860,0.174741,0.626478,3.544625,-0.313134,0.678186,0.792448,0.523175,-0.320751,-0.541040,0.401094,...,0.779267,-0.132523,-0.025593,0.591658,-0.585584,-0.164251,2.033246,-0.666144,-0.470561,0.869274
435,1.053793,-0.742868,-0.749657,1.549309,-0.089818,0.589554,-0.752958,0.005411,-0.302675,0.669336,...,-1.062230,-0.132523,-0.945741,-0.586691,-0.464582,1.783589,-0.631891,2.134855,1.829304,1.247930


### Model Traing and Evaluation
---

In [25]:
def evaluate_model(model, X_test, y_test):
    """
    Evaluate model performance
    :param model:
    :param X_test:
    :param y_test:
    :return:
    """
    test_yhat = model.predict(X_test)
    errors = abs(test_yhat - y_test)
    # mape = 100 * np.mean(errors / y_test)

    train_yhat = model.predict(X_train)
    test_yhat = model.predict(X_test)

    print('Model Performance Check:')
    print("***"*3)
    print('Average Error: {:0.4f}'.format(np.mean(errors)))
    print('MSE: {:0.4f}'.format(mean_squared_error(y_test, model.predict(X_test))))
    # print('Accuracy = {:0.2f}%.'.format(accuracy))
    print("\nModel Overfitting Check:")
    print("***"*3)
    print("Trainset Accuracy: {:0.4f}".format(accuracy_score(y_train, train_yhat)))
    print("Testset Accuracy: {:0.4f}".format(accuracy_score(y_test, test_yhat)))

def export_model(model, model_name: str):
    """
    Exports a given classifier to a pickel file
    :param model:
    :param model_name:
    :return: None
    """
    if model is not None:
        try:
            joblib.dump(gbrt_model, '{}.pkl'.format(model_name), compress=9)
            print("Successfully exported classifier!\n Location: {}")
        except Exception as ex:
            raise ex
    else:
        raise ValueError("Model is none")

In [26]:
# Gradient Boosted Regression Tree
gbrt_model = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=50,
    min_samples_leaf=2,
    random_state=0).fit(X_train, y_train)

gbrt_model.fit(X_train,y_train)

In [27]:
evaluate_model(gbrt_model, X_test, y_test)

Model Performance Check:
*********
Average Error: 0.2736
MSE: 0.5124

Model Overfitting Check:
*********
Trainset Accuracy: 1.0000
Testset Accuracy: 0.8209


### Iterative Evaluation of Tree Depth
---

In [43]:
# # define lists to collect scores
# train_scores, test_scores = list(), list()
# # define the tree depths to evaluate
# values = [i for i in range(40, 51)]
#
# for i in values:
# 	# configure the model
# 	model = GradientBoostingClassifier(
#         n_estimators=100,
#         learning_rate=0.1,
#         max_depth=i,
#         min_samples_leaf=2,
#         random_state=0).fit(X_train, y_train)
#
# 	# fit model on the training dataset
# 	model.fit(X_train, y_train)
# 	# evaluate on the train dataset
# 	train_yhat = model.predict(X_train)
# 	train_acc = accuracy_score(y_train, train_yhat)
# 	train_scores.append(train_acc)
# 	# evaluate on the test dataset
# 	test_yhat = model.predict(X_test)
# 	test_acc = accuracy_score(y_test, test_yhat)
# 	test_scores.append(test_acc)
# 	# summarize progress
# 	print('>%d, train: %.3f, test: %.3f' % (i, train_acc, test_acc))

>40, train: 1.000, test: 0.925


KeyboardInterrupt: 

In [None]:
# # plot of train and test scores vs tree depth
# plt.plot(values, train_scores, '-o', label='Train')
# plt.plot(values, test_scores, '-o', label='Test')
# plt.title("Gradient Boosted Regression Tree")
# plt.xlabel("Tree Depth")
# plt.ylabel("Accuracy")
# plt.legend()
# plt.show()