# **XGBoost**
- Using data that was annotated
- Training it on the unbalanced data
- Using avgpool
- 1 second window
- Entropy is the measure of information contained in a state
- Testing it on the best perfoming combiantion of preprocessing and features



In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install xgboost



In [3]:
# Standard libraries
import numpy as np
import pandas as pd
import time
import os

# For audio
from IPython.display import Audio
import librosa

# For preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf

# For modeling
import xgboost as xgb
from sklearn.metrics import classification_report, balanced_accuracy_score, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV

# Operational
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import scipy.ndimage
import pygame
import time
from scipy.signal import butter, filtfilt
import random

pygame 2.6.1 (SDL 2.28.4, Python 3.10.12)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [4]:
pkl_path = '/content/drive/My Drive/Final-Year-Project/Dataset/Project-V4/feature-extraction/Annotated/AveragePooled/split_features_1s_all.pkl'

# Load the pickle file
with open(pkl_path, 'rb') as file:
    data = pickle.load(file)

In [5]:
train_data = data['train'].copy()
val_data = data['val'].copy()

In [6]:
train_labels = train_data['label'].copy()
temp = train_data.copy()
del temp['label']
tr_features = temp

In [7]:
val_labels = val_data['label'].copy()
temp = val_data.copy()
del temp['label']
v_features = temp

# Shuffle Data

In [8]:
def shuffle_data(input_label, input_features):
  input_len = len(input_label)
  np.random.seed(1826)
  input_indices = np.random.permutation(input_len)
  input_features = {key: np.array([input_features[key][i] for i in input_indices]) for key in input_features} # dictionary comprehension
  input_label = np.array([input_label[i] for i in input_indices])

  return input_label, input_features

In [9]:
train_y, train_features = shuffle_data(train_labels, tr_features)

In [10]:
val_y, val_features = shuffle_data(val_labels, v_features)

In [11]:
display(train_y.shape)
display(train_y[:15])

(12565,)

array([ 0, 19,  2,  1, 11, 17, 15,  2,  2,  2,  0,  1,  1, 12,  0],
      dtype=int32)

In [12]:
for key in train_features.keys():
  display(key)
  display(train_features[key].shape)
  display(train_features[key][0])

'melspectrogram'

(12565, 60)

array([5.0738169e-07, 5.2262533e-07, 5.5045160e-07, 5.9217962e-07,
       6.5191085e-07, 1.0943645e-06, 4.3586356e-06, 1.9036837e-05,
       1.2136879e-04, 2.6661658e-04, 8.2672836e-04, 2.7233763e-03,
       1.0063773e-02, 1.6930899e-02, 2.1348633e-02, 4.7072649e-02,
       7.6561354e-02, 4.7709879e-02, 6.2269606e-02, 7.6247416e-02,
       6.1358120e-02, 3.9835755e-02, 4.4739708e-02, 5.0686374e-02,
       4.4587012e-02, 5.6312285e-02, 4.7983538e-02, 5.3998958e-02,
       9.1156095e-02, 1.2960179e-01, 1.3472772e-01, 7.4508280e-02,
       2.8677055e-01, 4.0667152e-01, 7.1514398e-01, 6.9495946e-01,
       3.1333745e-01, 2.0148984e-01, 1.1326054e-01, 6.4187691e-02,
       7.1493596e-02, 8.1204779e-02, 7.1447775e-02, 5.0326228e-02,
       3.6310278e-02, 2.3586553e-02, 1.3296669e-02, 4.2282213e-03,
       3.2108359e-03, 4.2909747e-03, 3.0864956e-02, 6.0674440e-02,
       5.0374899e-02, 3.0964591e-02, 4.0448662e-03, 3.0534746e-04,
       3.1673382e-05, 3.8426388e-06, 2.3940433e-06, 2.1029000e

'mfcc'

(12565, 20)

array([-1.        , -0.16236389, -0.4283241 ,  0.04256841, -0.05486935,
        0.01894294,  0.00423389,  0.10348184, -0.03871864,  0.0294642 ,
       -0.02652909,  0.01274641, -0.01874611, -0.00494358,  0.02157117,
       -0.01889195,  0.00154897, -0.01315036,  0.00566544, -0.00682729],
      dtype=float32)

'chroma'

(12565, 12)

array([0.18776874, 0.15328921, 0.33524922, 0.454852  , 0.7760426 ,
       0.67847323, 0.3407292 , 0.24142714, 0.18064852, 0.15764236,
       0.18976806, 0.2140399 ], dtype=float32)

'rms'

(12565, 1)

array([0.0295004], dtype=float32)

In [13]:
display(val_y.shape)
display(val_y[:15])

(3318,)

array([ 3,  2,  1,  0,  0,  8, 19,  2, 19,  5, 19, 15, 11,  7,  2],
      dtype=int32)

In [14]:
for key in val_features.keys():
  display(key)
  display(val_features[key].shape)
  display(val_features[key][0])

'melspectrogram'

(3318, 60)

array([1.8282815e-04, 1.8516471e-04, 1.8923497e-04, 1.9535325e-04,
       2.0405794e-04, 2.1621672e-04, 2.3355127e-04, 2.6386356e-04,
       3.8294177e-04, 8.7105710e-04, 4.0952233e-03, 7.9782819e-03,
       1.2021556e-02, 1.1450265e-02, 1.8598879e-02, 4.5423362e-02,
       5.1051736e-02, 2.1339525e-02, 2.7392777e-02, 2.3795826e-02,
       1.3348093e-02, 1.3525566e-02, 8.1301248e-03, 8.5273301e-03,
       1.1604097e-02, 7.5979652e-03, 4.1982550e-03, 4.9553681e-03,
       6.7672539e-03, 8.9110127e-03, 1.0491555e-02, 8.9149661e-03,
       8.2060061e-03, 6.9327699e-03, 7.8629563e-03, 8.1582973e-03,
       7.1896906e-03, 3.3999698e-03, 3.0463785e-03, 5.0475681e-03,
       5.0008059e-03, 4.5382837e-03, 4.0109674e-03, 4.0369830e-03,
       9.5070582e-03, 1.8515797e-02, 5.2194532e-02, 2.4449244e-01,
       2.7477503e-01, 5.4956347e-01, 6.2898314e-01, 4.4136560e-01,
       5.3704370e-02, 2.4117595e-02, 1.1598948e-02, 4.3818848e-03,
       2.8213693e-04, 6.7681176e-06, 2.1489736e-06, 2.1393287e

'mfcc'

(3318, 20)

array([-1.0000000e+00, -1.0882675e-01, -1.4413263e-01, -6.7996904e-02,
       -1.5429229e-01,  7.5967886e-02, -1.1015258e-01,  1.3946199e-01,
       -3.3290099e-02,  4.8340354e-02,  2.1415181e-02, -1.9456839e-02,
        2.5275797e-02, -2.4598164e-02, -8.1183799e-03,  6.0081406e-04,
       -2.5172167e-02,  8.5308263e-03, -9.6246637e-03,  7.1927463e-04],
      dtype=float32)

'chroma'

(3318, 12)

array([0.02229075, 0.02539484, 0.03112359, 0.1376025 , 0.2878612 ,
       0.39549676, 0.5717799 , 0.5721824 , 0.20036055, 0.06311192,
       0.03563747, 0.02806006], dtype=float32)

'rms'

(3318, 1)

array([0.03920381], dtype=float32)

## **XGBoost Model**

In [23]:
train_results = {}
val_results = {}

### **With all the features**

In [16]:
training_features_3D = np.concatenate((train_features['mfcc'], train_features['chroma'], train_features['rms'], train_features['melspectrogram']), axis=1)
# A random forest model expect a 2D input of (n_samples, n_features). Since the input is 3D here, it will cause an error when passed through a RF,
# I need to flatten the training features from 3D to 2D... eg. (7105, 93 * 259)
training_features = training_features_3D.reshape(training_features_3D.shape[0], -1)

training_features.shape

(12565, 93)

In [17]:
validation_features_3D = np.concatenate((val_features['mfcc'], val_features['chroma'], val_features['rms'], val_features['melspectrogram']), axis=1)
validation_features = validation_features_3D.reshape(validation_features_3D.shape[0], -1)
validation_features.shape

(3318, 93)

Fit the model with training data

In [19]:
# Imma research on how to do this properly.

# I need to read the manual. They say i gotta check out the scale_pos_weight paramter if i have imbalanced data and to use AUC for evaluation
# model = xgb.XGBClassifier(booster='dart')

# param_grid = {'max_depth': [2, 4, 6],
#               "n_estimators": [50, 100, 200],
#               'learning_rate': [0.1, 0.01, 0.05],
#               'gamma': [0, 0.25, 1],
#               'scale_pos_weight': [1, 3, 5]}

# clf = GridSearchCV(
#         model,
#         param_grid,
#         verbose=0,
#         n_jobs=2,
#         cv=5,
#         scoring = 'roc_auc'
#     )

# clf.fit(training_features, train_y)
# display(clf.best_score_)
# display(clf.best_params_)



KeyboardInterrupt: 

In [19]:
model = xgb.XGBClassifier(booster='dart',
                          n_estimators = 50,
                          gamma = 0.25,
                          scale_pos_weight=3
                          )
model.fit(training_features, train_y)

Parameters: { "scale_pos_weight" } are not used.



In [24]:
print('Training accuracy:', model.score(training_features, train_y))
train_results['50_0.25gamma_scalepos3'] = model.score(training_features, train_y)

print('Validation accuracy:', model.score(validation_features, val_y))
val_results['50_0.25gamma_scalepos3'] = model.score(validation_features, val_y)

Training accuracy: 1.0
Validation accuracy: 0.604882459312839


In [21]:
train_yhat_result = model.predict(training_features)

print('Training classification Report \n')
print(classification_report(train_y, train_yhat_result))

Training classification Report 

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2060
           1       1.00      1.00      1.00      2477
           2       1.00      1.00      1.00      2203
           3       1.00      1.00      1.00       437
           4       1.00      1.00      1.00       218
           5       1.00      1.00      1.00       387
           6       1.00      1.00      1.00       167
           7       1.00      1.00      1.00       263
           8       1.00      1.00      1.00       196
           9       1.00      1.00      1.00       443
          10       1.00      1.00      1.00       191
          11       1.00      1.00      1.00       211
          12       1.00      1.00      1.00       378
          13       1.00      1.00      1.00       232
          14       1.00      1.00      1.00       188
          15       1.00      1.00      1.00      1133
          16       1.00      1.00      1.00     

In [22]:
val_yhat_result = model.predict(validation_features)

print('Validation classification Report \n')
print(classification_report(val_y, val_yhat_result))

Validation classification Report 

              precision    recall  f1-score   support

           0       0.67      0.77      0.72       455
           1       0.36      0.64      0.46       492
           2       0.69      0.39      0.50       889
           3       0.83      1.00      0.91       150
           4       0.73      0.12      0.21        67
           5       0.39      0.58      0.47        43
           6       0.32      0.25      0.28        24
           7       0.66      0.70      0.68        44
           8       0.37      0.68      0.48        50
           9       0.89      0.92      0.91       169
          10       0.63      0.64      0.64        53
          11       0.50      0.27      0.35        66
          12       0.81      0.73      0.77        59
          13       0.68      0.44      0.53        57
          14       0.45      0.76      0.56        38
          15       0.67      0.59      0.62       311
          16       0.93      0.75      0.83   

In [25]:
model = xgb.XGBClassifier(booster='gbtree',
                          n_estimators = 100,
                          gamma = 0.25,
                          )
model.fit(training_features, train_y)

In [26]:
print('Training accuracy:', model.score(training_features, train_y))
train_results['gbtree_50_0.25gamma'] = model.score(training_features, train_y)

print('Validation accuracy:', model.score(validation_features, val_y))
val_results['gbtree_50_0.25gamma'] = model.score(validation_features, val_y)

Training accuracy: 1.0
Validation accuracy: 0.6078963230861965


So the gbtree booster runs faster and gives similar results.

In [27]:
model = xgb.XGBClassifier(booster='gbtree',
                          n_estimators = 100,
                          gamma = 1,
                          )
model.fit(training_features, train_y)

In [28]:
print('Training accuracy:', model.score(training_features, train_y))
train_results['gbtree_50_1gamma'] = model.score(training_features, train_y)

print('Validation accuracy:', model.score(validation_features, val_y))
val_results['gbtree_50_1gamma'] = model.score(validation_features, val_y)

Training accuracy: 0.9988857938718663
Validation accuracy: 0.6151295961422544


So the gbtree just runs fast and gives better results. Okay. Okay. With the 1gamma too. I need to try it with the NAP results.

In [29]:
model = xgb.XGBClassifier(booster='gblinear',
                          n_estimators = 100,
                          gamma = 1,
                          )
model.fit(training_features, train_y)

Parameters: { "gamma" } are not used.



In [30]:
print('Training accuracy:', model.score(training_features, train_y))
train_results['gblinear_100_1gamma'] = model.score(training_features, train_y)

print('Validation accuracy:', model.score(validation_features, val_y))
val_results['gblinear_100_1gamma'] = model.score(validation_features, val_y)

Training accuracy: 0.8292877039395146
Validation accuracy: 0.6582278481012658


Okay. So, the gblinear model runs faster, gives better results, and also less overfitting. Wild. Let me increase the number of estimators

In [31]:
model = xgb.XGBClassifier(booster='gblinear',
                          n_estimators = 200,
                          gamma = 1,
                          )
model.fit(training_features, train_y)

Parameters: { "gamma" } are not used.



In [32]:
print('Training accuracy:', model.score(training_features, train_y))
train_results['gblinear_200_1gamma'] = model.score(training_features, train_y)

print('Validation accuracy:', model.score(validation_features, val_y))
val_results['gblinear_200_1gamma'] = model.score(validation_features, val_y)

Training accuracy: 0.843533625149224
Validation accuracy: 0.6609403254972875


In [33]:
val_yhat_result = model.predict(validation_features)

print('Validation classification Report \n')
print(classification_report(val_y, val_yhat_result))

Validation classification Report 

              precision    recall  f1-score   support

           0       0.71      0.77      0.74       455
           1       0.55      0.52      0.54       492
           2       0.76      0.74      0.75       889
           3       0.86      1.00      0.93       150
           4       0.37      0.16      0.23        67
           5       0.36      0.70      0.47        43
           6       0.19      0.42      0.26        24
           7       0.55      0.68      0.61        44
           8       0.42      0.50      0.45        50
           9       0.89      0.88      0.89       169
          10       0.26      0.38      0.31        53
          11       0.49      0.47      0.48        66
          12       0.64      0.42      0.51        59
          13       0.60      0.54      0.57        57
          14       0.27      0.42      0.33        38
          15       0.65      0.52      0.58       311
          16       0.92      0.65      0.77   

Okay, so increasing the number of estimators improved the model a bit. Noted noted. And still less overfit.
Better results for precision and stuff too.


In [34]:
model = xgb.XGBClassifier(booster='gblinear',
                          n_estimators = 300,
                          gamma = 1,
                          learning_rate = 0.25
                          )
model.fit(training_features, train_y)

Parameters: { "gamma" } are not used.



In [35]:
print('Training accuracy:', model.score(training_features, train_y))
train_results['gblinear_300_1gamma_0.25learningrate'] = model.score(training_features, train_y)

print('Validation accuracy:', model.score(validation_features, val_y))
val_results['gblinear_300_1gamma_0.25learningrate'] = model.score(validation_features, val_y)

Training accuracy: 0.8390768006366892
Validation accuracy: 0.6630500301386377


In [36]:
val_yhat_result = model.predict(validation_features)

print('Validation classification Report \n')
print(classification_report(val_y, val_yhat_result))

Validation classification Report 

              precision    recall  f1-score   support

           0       0.71      0.78      0.74       455
           1       0.56      0.53      0.55       492
           2       0.76      0.75      0.75       889
           3       0.87      1.00      0.93       150
           4       0.43      0.18      0.25        67
           5       0.33      0.67      0.45        43
           6       0.21      0.42      0.28        24
           7       0.53      0.64      0.58        44
           8       0.38      0.50      0.43        50
           9       0.89      0.89      0.89       169
          10       0.27      0.38      0.31        53
          11       0.50      0.48      0.49        66
          12       0.68      0.46      0.55        59
          13       0.60      0.51      0.55        57
          14       0.25      0.37      0.30        38
          15       0.64      0.52      0.57       311
          16       0.90      0.65      0.76   

Even better. Less overfit too. Increasing the learning rate worked even better too.

In [37]:
model = xgb.XGBClassifier(booster='gblinear',
                          n_estimators = 300,
                          gamma = 1,
                          learning_rate = 0.5
                          )
model.fit(training_features, train_y)

Parameters: { "gamma" } are not used.



In [38]:
print('Training accuracy:', model.score(training_features, train_y))
train_results['gblinear_300_1gamma_0.5learningrate'] = model.score(training_features, train_y)

print('Validation accuracy:', model.score(validation_features, val_y))
val_results['gblinear_300_1gamma_0.5learningrate'] = model.score(validation_features, val_y)

Training accuracy: 0.8483883804218066
Validation accuracy: 0.6612417118746232


In [39]:
val_yhat_result = model.predict(validation_features)

print('Validation classification Report \n')
print(classification_report(val_y, val_yhat_result))

Validation classification Report 

              precision    recall  f1-score   support

           0       0.71      0.77      0.74       455
           1       0.55      0.52      0.53       492
           2       0.75      0.74      0.75       889
           3       0.86      1.00      0.93       150
           4       0.34      0.16      0.22        67
           5       0.36      0.72      0.48        43
           6       0.19      0.42      0.26        24
           7       0.55      0.68      0.61        44
           8       0.39      0.50      0.44        50
           9       0.89      0.89      0.89       169
          10       0.23      0.32      0.27        53
          11       0.51      0.47      0.49        66
          12       0.61      0.39      0.47        59
          13       0.61      0.54      0.57        57
          14       0.29      0.45      0.35        38
          15       0.65      0.53      0.58       311
          16       0.93      0.69      0.79   

Still a good improvement

In [40]:
model = xgb.XGBClassifier(booster='gblinear',
                          n_estimators = 400,
                          gamma = 1,
                          learning_rate = 0.3
                          )
model.fit(training_features, train_y)

Parameters: { "gamma" } are not used.



In [41]:
print('Training accuracy:', model.score(training_features, train_y))
train_results['gblinear_400_1gamma_0.3learningrate'] = model.score(training_features, train_y)

print('Validation accuracy:', model.score(validation_features, val_y))
val_results['gblinear_400_1gamma_0.3learningrate'] = model.score(validation_features, val_y)

Training accuracy: 0.8456824512534818
Validation accuracy: 0.661543098251959


In [42]:
val_yhat_result = model.predict(validation_features)

print('Validation classification Report \n')
print(classification_report(val_y, val_yhat_result))

Validation classification Report 

              precision    recall  f1-score   support

           0       0.72      0.78      0.75       455
           1       0.55      0.53      0.54       492
           2       0.76      0.74      0.75       889
           3       0.86      1.00      0.93       150
           4       0.37      0.16      0.23        67
           5       0.36      0.70      0.47        43
           6       0.19      0.42      0.26        24
           7       0.53      0.66      0.59        44
           8       0.38      0.50      0.43        50
           9       0.89      0.89      0.89       169
          10       0.23      0.32      0.27        53
          11       0.49      0.47      0.48        66
          12       0.66      0.42      0.52        59
          13       0.62      0.54      0.58        57
          14       0.27      0.42      0.33        38
          15       0.65      0.52      0.58       311
          16       0.92      0.65      0.77   

# Review results from all models

In [44]:
train_results_df = pd.DataFrame(list(train_results.items()), columns=['Parameters', 'Train_Accuracy']).round(3)
val_results_df = pd.DataFrame(list(val_results.items()), columns=['Parameters', 'Val_Accuracy']).round(3)

result_df = train_results_df.merge(val_results_df, on='Parameters')
result_df = result_df.sort_values('Parameters')
result_df

Unnamed: 0,Parameters,Train_Accuracy,Val_Accuracy
0,50_0.25gamma_scalepos3,1.0,0.605
3,gblinear_100_1gamma,0.829,0.658
4,gblinear_200_1gamma,0.844,0.661
5,gblinear_300_1gamma_0.25learningrate,0.839,0.663
6,gblinear_300_1gamma_0.5learningrate,0.848,0.661
7,gblinear_400_1gamma_0.3learningrate,0.846,0.662
1,gbtree_50_0.25gamma,1.0,0.608
2,gbtree_50_1gamma,0.999,0.615
