# Gradient Boosted Regression Trees
---

#### Data Classification Task - Model Training and Testing

### Imports
---

In [2]:
import pandas as pd
import numpy as np
import joblib
import pickle
import matplotlib.pyplot as plt

from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_score, accuracy_score, recall_score

from imblearn.combine import SMOTEENN

### Load Dataset
---

In [3]:
df = pd.read_pickle('../training_dataset_task3/task_3_training_e8da4715deef7d56_f8b7378_pandas.pkl').reset_index()

X = df.loc[:, 'essentia_dissonance_mean':'mirtoolbox_roughness_pct_90']
y = df['quadrant']

# preprocess dataset
X_std = StandardScaler().fit_transform(X)
X = pd.DataFrame(X_std, columns=X.columns)

### Data Preprocessing
---
Where the ✨ magic ✨ happens

In [4]:
# add segment_id to training data for doing the cross validation splits
X["segment_id"] = df["segment_id"]

# remove segment_id 26
seg_26_indices = (X["segment_id"] == 26)
X_test = X[seg_26_indices].drop(["segment_id"], axis=1)
y_test = y[seg_26_indices]

X_train = X.drop(X[seg_26_indices].index, axis=0).reset_index(drop=True)
y_train = y.drop(X[seg_26_indices].index, axis=0)


# https://imbalanced-learn.org/stable/combine.html
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)

In [5]:
# split the data according to segment_id
cv = []
for i in range(24):
    train_indices = X_resampled[~X_resampled["segment_id"].isin([i, i + 1])].index.to_list()
    test_indices = X_resampled[X_resampled["segment_id"].isin([i, i + 1])].index.to_list()
    cv.append((train_indices, test_indices))

# remove the segment_id as we don't want it in the training data
X_resampled = X_resampled.drop(["segment_id"], axis=1)

### Model Training and Testing
---

In [6]:
def evaluate_model(model, train_set_x, train_set_y, test_set_x, test_set_y):
    """

    :param model:
    :return:
    """
    train_yhat = model.predict(train_set_x)
    test_yhat = model.predict(test_set_x)

    errors = abs(test_yhat - y_test)

    print('Model Performance Check:')
    print("***"*3)
    print(' Average Error: {:0.4f}'.format(np.mean(errors)))
    print(' Precission: {:0.4f}'.format(precision_score(test_set_y, test_yhat, average='micro')))
    print(' Recall: {:0.4f}'.format(recall_score(test_set_y, test_yhat, average='micro')))


    print("\nModel Accuracy Check:")
    print("***"*3)
    print(" Trainset Accuracy: {:0.4f}".format(accuracy_score(train_set_y, train_yhat)))
    print(" Testset Accuracy: {:0.4f}".format(accuracy_score(test_set_y, test_yhat)))

def export_model(model, model_name: str):
    """
    Exports a given classifier to a pickel file
    :param model:
    :param model_name:
    :return: None
    """
    if model is not None:
        try:
            joblib.dump(model, '{}.pkl'.format(model_name), compress=9)
            print("Successfully exported classifier!\n Location: {}")
        except Exception as ex:
            raise ex
    else:
        raise ValueError("Model is none")

In [15]:
X_resampled

Unnamed: 0,essentia_dissonance_mean,essentia_dissonance_stdev,essentia_dynamic_complexity,essentia_loudness,essentia_onset_rate,essentia_pitch_salience_mean,essentia_pitch_salience_stdev,essentia_spectral_centroid_mean,essentia_spectral_centroid_stdev,essentia_spectral_complexity_mean,...,mirtoolbox_novelty_std,mirtoolbox_novelty_pct_10,mirtoolbox_novelty_pct_50,mirtoolbox_novelty_pct_90,mirtoolbox_pulseclarity,mirtoolbox_roughness_mean,mirtoolbox_roughness_std,mirtoolbox_roughness_pct_10,mirtoolbox_roughness_pct_50,mirtoolbox_roughness_pct_90
0,0.090946,-1.049143,-0.773709,0.964218,-0.418962,0.280063,-0.386888,-0.379471,-0.649024,-0.687507,...,0.455788,-0.132523,-0.733623,-0.059085,-0.918228,-0.034804,-0.741760,0.294055,-0.018662,-0.227726
1,0.270427,-0.674596,-0.673356,0.982513,-0.309247,0.018125,-0.097964,0.246241,-0.265885,-1.205762,...,-0.986785,-0.132523,-1.021422,-1.318619,-1.121356,-0.334504,-0.000192,-0.238357,-0.422630,-0.189874
2,-0.019990,-1.521880,-0.537949,0.080994,-1.186966,-0.454266,-0.329631,-0.161461,-0.492992,-0.132790,...,1.972376,-0.132523,2.069332,2.463840,0.047504,-1.288435,-1.084122,-0.978312,-1.275097,-1.334440
3,0.744962,0.594004,2.790197,-0.414494,0.129612,0.353962,-0.026213,1.472166,1.399065,1.229262,...,-0.355290,-0.132523,-0.000947,-0.207538,0.691361,-0.503286,0.362072,-0.577971,-0.548347,-0.452646
4,0.269103,0.974086,1.176281,-0.516288,-0.199533,0.938598,-0.167299,0.882138,1.191004,1.599073,...,0.004257,-0.132523,-0.408126,-0.250454,-0.731234,0.047901,0.930058,-0.490207,0.195989,0.154293
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,-1.403430,-0.025506,0.776422,-0.792378,-0.309247,-0.386267,0.963335,-0.736275,-0.189887,0.122432,...,-0.434882,-0.132523,-1.045509,-0.558994,0.315030,-1.083367,-0.586390,-0.824407,-1.203187,-1.031429
995,-1.975285,-0.329677,0.347851,-0.766778,0.019897,-1.185301,2.205495,-0.852301,-0.200213,-0.783865,...,-0.676836,-0.132523,-0.528553,-0.976388,-0.362398,-1.146172,-0.477209,-1.027908,-1.200254,-1.004156
996,-1.507881,-0.340120,0.156639,-0.710164,-0.309247,0.292024,2.396645,-0.984551,-0.542357,0.182332,...,-0.879013,0.104492,-1.057542,-1.586059,-0.864682,0.145945,1.486274,-0.669291,0.215161,0.533152
997,-1.655550,-0.729454,-0.474019,-0.707023,-0.748107,1.263356,0.874696,-1.151020,-0.584318,0.739652,...,-1.418508,0.117940,-0.990264,-1.443719,-0.297005,1.205389,1.194803,0.903250,1.206546,1.418579


In [71]:
# Gradient Boosted Regression Tree
# gbrt_model_cv = GradientBoostingClassifier(
#    n_estimators=100,
#    learning_rate=0.1,
#    max_depth=10,
#    min_samples_leaf=2,
#    random_state=0).fit(X_train, y_train)

# gbrt_model.fit(X_train, y_train)

params = {"n_estimators": [100],
          "learning_rate": [0.1],
          "min_samples_leaf": [10],
          "max_depth": [10]}

gbdt_cv = GridSearchCV(GradientBoostingClassifier(), params, cv=cv, n_jobs=-1)
gbdt_cv.fit(X_resampled, y_resampled)

In [72]:
print("Best Model Parameters: \n {}\n Best Score: {}\n Trainset Score: {}\n Testset Score: {}".format(gbdt_cv.best_params_, gbdt_cv.best_score_, gbdt_cv.score(X_resampled, y_resampled), gbdt_cv.score(X_test, y_test)))

Best Model Parameters: 
 {'learning_rate': 0.1, 'max_depth': 10, 'min_samples_leaf': 10, 'n_estimators': 100}
 Best Score: 0.5314454479085674
 Trainset Score: 1.0
 Testset Score: 0.27906976744186046


### GBDT Model Evaluation
---

In [73]:
params = gbdt_cv.best_params_
gbdt = GradientBoostingClassifier(**params)
gbdt.fit(X_resampled, y_resampled)

In [74]:
evaluate_model(gbdt, X_resampled, y_resampled, X_test, y_test)

Model Performance Check:
*********
 Average Error: 1.4767
 Precission: 0.2674
 Recall: 0.2674

Model Accuracy Check:
*********
 Trainset Accuracy: 1.0000
 Testset Accuracy: 0.2674
