# Gradient Boosted Regression Trees
---

#### Data Classification Task - Model Training and Testing

### Imports
---

In [52]:
import pandas as pd
import numpy as np
import joblib
import pickle
import matplotlib.pyplot as plt

from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_score, accuracy_score, recall_score

from imblearn.combine import SMOTEENN

### Load Dataset
---

In [53]:
df = pd.read_pickle('../training_dataset_task3/task_3_training_e8da4715deef7d56_f8b7378_pandas.pkl').reset_index()

X = df.loc[:, 'essentia_dissonance_mean':'gemmes_flow_binary']
y = df['quadrant']

### Data Preprocessing
---
Where the ✨ magic ✨ happens

In [54]:
# add segment_id to training data for doing the cross validation splits
X["segment_id"] = df["segment_id"]

# remove segment_id 26
seg_26_indices = (X["segment_id"] == 26)
X_test = X[seg_26_indices].drop(["segment_id"], axis=1)
y_test = y[seg_26_indices]

X_train = X.drop(X[seg_26_indices].index, axis=0).reset_index(drop=True)
y_train = y.drop(X[seg_26_indices].index, axis=0)


# https://imbalanced-learn.org/stable/combine.html
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

# preprocess dataset
X_std = StandardScaler().fit_transform(X)
X = pd.DataFrame(X_std, columns=X.columns)

In [55]:
# split the data according to segment_id
cv = []
for i in range(24):
    train_indices = X_resampled[~X_resampled["segment_id"].isin([i, i + 1])].index.to_list()
    test_indices = X_resampled[X_resampled["segment_id"].isin([i, i + 1])].index.to_list()
    cv.append((train_indices, test_indices))

# remove the segment_id as we don't want it in the training data
X_resampled = X_resampled.drop(["segment_id"], axis=1)

### Model Training and Testing
---

In [56]:
def evaluate_model(model, train_set_x, train_set_y, test_set_x, test_set_y):
    """

    :param model:
    :return:
    """
    train_yhat = model.predict(train_set_x)
    test_yhat = model.predict(test_set_x)

    errors = abs(test_yhat - y_test)

    print('Model Performance Check:')
    print("***"*3)
    print(' Average Error: {:0.4f}'.format(np.mean(errors)))
    print(' Precission: {:0.4f}'.format(precision_score(test_set_y, test_yhat, average='micro')))
    print(' Recall: {:0.4f}'.format(recall_score(test_set_y, test_yhat, average='micro')))


    print("\nModel Accuracy Check:")
    print("***"*3)
    print(" Trainset Accuracy: {:0.4f}".format(accuracy_score(train_set_y, train_yhat)))
    print(" Testset Accuracy: {:0.4f}".format(accuracy_score(test_set_y, test_yhat)))

def export_model(model, model_name: str):
    """
    Exports a given classifier to a pickel file
    :param model:
    :param model_name:
    :return: None
    """
    if model is not None:
        try:
            joblib.dump(model, '{}.pkl'.format(model_name), compress=9)
            print("Successfully exported classifier!\n Location: {}")
        except Exception as ex:
            raise ex
    else:
        raise ValueError("Model is none")

In [57]:
# Gradient Boosted Regression Tree
# gbrt_model_cv = GradientBoostingClassifier(
#    n_estimators=100,
#    learning_rate=0.1,
#    max_depth=10,
#    min_samples_leaf=2,
#    random_state=0).fit(X_train, y_train)

# gbrt_model.fit(X_train, y_train)

params = {"n_estimators": [21],
          "learning_rate": [0.16,0.15,0.12],
          "min_samples_leaf": [110,105],
          "max_depth": [15],
          "criterion": ["friedman_mse", "squared_error"]}

gbdt_cv = GridSearchCV(GradientBoostingClassifier(), params, cv=cv, n_jobs=-1)
gbdt_cv.fit(X_resampled, y_resampled)

In [58]:
print("Best Model Parameters: \n {}\n Best Score: {}\n Trainset Score: {}\n Testset Score: {}".format(gbdt_cv.best_params_, gbdt_cv.best_score_, gbdt_cv.score(X_resampled, y_resampled), gbdt_cv.score(X_test, y_test)))

Best Model Parameters: 
 {'criterion': 'squared_error', 'learning_rate': 0.16, 'max_depth': 15, 'min_samples_leaf': 105, 'n_estimators': 21}
 Best Score: 0.8939130932826108
 Trainset Score: 0.9629629629629629
 Testset Score: 0.9767441860465116


### GBDT Model Evaluation
---

In [59]:
params = gbdt_cv.best_params_
gbdt = GradientBoostingClassifier(**params)
gbdt.fit(X_resampled, y_resampled)

In [60]:
evaluate_model(gbdt, X_resampled, y_resampled, X_test, y_test)

Model Performance Check:
*********
 Average Error: 0.0233
 Precission: 0.9767
 Recall: 0.9767

Model Accuracy Check:
*********
 Trainset Accuracy: 0.9630
 Testset Accuracy: 0.9767
