# **XGBoost**
- Using data that was annotated
- Training it on the unbalanced data
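- Using features that are not average-pooled (loaded from the `NotAveragePooled` feature folder)
- 3 second window (`split_features_3s_all_2D.pkl`)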
- Entropy is the measure of information contained in a state
- Testing it on the best performing combination of preprocessing and features



In [1]:
# Mount Google Drive into the Colab runtime at /content/drive so the
# feature file loaded later in this notebook is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install xgboost



In [3]:
# Standard libraries
import numpy as np
import pandas as pd
import time
import os

# For audio
from IPython.display import Audio
import librosa

# For preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf

# For modeling
import xgboost as xgb
from sklearn.metrics import classification_report, balanced_accuracy_score, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV

# Operational
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import scipy.ndimage
import pygame
import time
from scipy.signal import butter, filtfilt
import random

pygame 2.6.0 (SDL 2.28.4, Python 3.10.12)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [4]:
# Location of the pre-extracted feature splits on the mounted Drive.
pkl_path = '/content/drive/My Drive/Final-Year-Project/Dataset/Final-Version-of-Bird-Classification-Project/feature-extraction/Annotated/Regular/NotAveragePooled/split_features_3s_all_2D.pkl'

# Deserialize the stored splits.
# NOTE(review): pickle.load can execute arbitrary code, so only point this at
# a file you produced yourself.
with open(pkl_path, mode='rb') as pkl_file:
    data = pickle.load(pkl_file)

In [5]:
# Take a copy of the 'train' and 'val' entries so later cells do not work on
# the objects stored in `data` directly.
train_data, val_data = (data[split].copy() for split in ('train', 'val'))

In [6]:
# Separate the training labels from the feature entries.
train_labels = train_data['label'].copy()

# Build the feature container directly instead of going through a shared
# scratch variable: copy the split, then drop its 'label' entry.
tr_features = train_data.copy()
del tr_features['label']

In [7]:
# Separate the validation labels from the feature entries.
val_labels = val_data['label'].copy()

# Build the feature container directly instead of going through a shared
# scratch variable: copy the split, then drop its 'label' entry.
v_features = val_data.copy()
del v_features['label']

# Shuffle Data

In [8]:
def shuffle_data(input_label, input_features, seed=1826):
  """Shuffle labels and every feature entry with one shared permutation.

  Parameters
  ----------
  input_label : sequence indexable by integer position
      One label per sample.
  input_features : dict-like
      Maps a feature name to a per-sample sequence; each must have the same
      length as ``input_label``.
  seed : int, optional
      Seed used for the permutation. The default (1826) reproduces the order
      this function has always produced.

  Returns
  -------
  tuple
      ``(labels, features)`` where ``labels`` is a NumPy array and
      ``features`` is a dict of NumPy arrays, all reordered identically.

  Raises
  ------
  ValueError
      If any feature entry has a different number of samples than the labels.
  """
  input_len = len(input_label)

  # Fail loudly on misaligned inputs instead of silently dropping samples.
  for key in input_features:
    if len(input_features[key]) != input_len:
      raise ValueError(
          f"feature '{key}' has {len(input_features[key])} samples, "
          f"expected {input_len}"
      )

  # Seeding the global NumPy generator is kept on purpose: it makes the
  # permutation reproducible and matches the previous behaviour exactly.
  np.random.seed(seed)
  input_indices = np.random.permutation(input_len)

  # Apply the same index order to every feature entry and to the labels so
  # sample i of each output still refers to the same original sample.
  shuffled_features = {
      key: np.array([input_features[key][i] for i in input_indices])
      for key in input_features
  }
  shuffled_label = np.array([input_label[i] for i in input_indices])

  return shuffled_label, shuffled_features

In [9]:
# Shuffle the training labels and features with the same permutation.
train_y, train_features = shuffle_data(
    input_label=train_labels,
    input_features=tr_features,
)

In [10]:
# Shuffle the validation labels and features with the same permutation.
val_y, val_features = shuffle_data(
    input_label=val_labels,
    input_features=v_features,
)

## **XGBoost Model**

In [11]:
# Accuracy of each configuration, keyed by a short parameter label.
train_results = dict()
val_results = dict()

# Separate container for validation scores; the cells shown here never write to it.
val_scores = dict()

### **With all the features**

In [12]:
# Join the MFCC and chroma features along axis 1.
training_features_3D = np.concatenate(
    [train_features[name] for name in ('mfcc', 'chroma')],
    axis=1,
)

# Flatten everything after the sample axis so each sample becomes one row.
training_features = training_features_3D.reshape(len(training_features_3D), -1)

training_features.shape

(5278, 8288)

In [13]:
# Join the MFCC and chroma features along axis 1.
validation_features_3D = np.concatenate(
    [val_features[name] for name in ('mfcc', 'chroma')],
    axis=1,
)

# Flatten everything after the sample axis so each sample becomes one row.
validation_features = validation_features_3D.reshape(len(validation_features_3D), -1)
validation_features.shape

(1350, 8288)

Fit the model with training data

In [14]:
# Tree booster, 100 estimators, gamma = 0.25.
model = xgb.XGBClassifier(booster='gbtree', n_estimators=100, gamma=0.25)
model.fit(X=training_features, y=train_y)

In [15]:
# Label under which this run is stored in both result dicts.
run_label = 'gbtree_100_0.25gamma'

# Score each split once and reuse the value for the printout and the table,
# instead of running the prediction pass twice per split.
train_accuracy = model.score(training_features, train_y)
val_accuracy = model.score(validation_features, val_y)

print('Training accuracy:', train_accuracy)
train_results[run_label] = train_accuracy

print('Validation accuracy:', val_accuracy)
val_results[run_label] = val_accuracy

Training accuracy: 1.0
Validation accuracy: 0.6881481481481482


In [16]:
# Predict the validation set, then print the per-class report.
val_yhat_result = model.predict(validation_features)

print(
    'Validation classification Report \n',
    classification_report(val_y, val_yhat_result),
    sep='\n',
)

Validation classification Report 

              precision    recall  f1-score   support

           0       0.74      0.83      0.78       114
           1       0.37      0.49      0.42       141
           2       0.76      0.68      0.72       271
           3       0.80      0.92      0.86        90
           4       0.67      0.11      0.19        18
           5       0.00      0.00      0.00        15
           6       0.38      0.52      0.44        23
           7       0.93      0.98      0.96        44
           8       0.73      0.70      0.71        50
           9       0.89      0.84      0.86        49
          10       0.83      0.79      0.81        48
          11       0.90      0.57      0.69        46
          12       0.80      0.96      0.87        54
          13       0.54      0.77      0.63        48
          14       0.53      0.82      0.64        38
          15       0.59      0.42      0.49       117
          16       0.94      0.91      0.93   

So the gbtree booster runs faster and gives similar results.

In [17]:
# Tree booster, 100 estimators, gamma raised to 1.
model = xgb.XGBClassifier(booster='gbtree', n_estimators=100, gamma=1)
model.fit(X=training_features, y=train_y)

In [18]:
# Label under which this run is stored in both result dicts.
run_label = 'gbtree_100_1gamma'

# Score each split once and reuse the value for the printout and the table,
# instead of running the prediction pass twice per split.
train_accuracy = model.score(training_features, train_y)
val_accuracy = model.score(validation_features, val_y)

print('Training accuracy:', train_accuracy)
train_results[run_label] = train_accuracy

print('Validation accuracy:', val_accuracy)
val_results[run_label] = val_accuracy

Training accuracy: 1.0
Validation accuracy: 0.6688888888888889


In [19]:
# Predict the validation set, then print the per-class report.
val_yhat_result = model.predict(validation_features)

print(
    'Validation classification Report \n',
    classification_report(val_y, val_yhat_result),
    sep='\n',
)

Validation classification Report 

              precision    recall  f1-score   support

           0       0.68      0.81      0.74       114
           1       0.35      0.46      0.40       141
           2       0.76      0.67      0.71       271
           3       0.80      0.93      0.86        90
           4       0.40      0.11      0.17        18
           5       0.00      0.00      0.00        15
           6       0.34      0.52      0.41        23
           7       0.93      0.98      0.96        44
           8       0.67      0.70      0.69        50
           9       0.89      0.82      0.85        49
          10       0.82      0.67      0.74        48
          11       0.93      0.54      0.68        46
          12       0.82      0.94      0.88        54
          13       0.50      0.73      0.59        48
          14       0.56      0.89      0.69        38
          15       0.53      0.36      0.43       117
          16       0.92      0.89      0.91   

So the gbtree just runs fast and gives better results. Okay. Okay. With the 1gamma too. I need to try it with the NAP results.

In [20]:
# Linear booster, 100 estimators.
# NOTE(review): XGBoost reports that `gamma` is not used with this booster
# (see the warning in the cell output); it is kept only so the call matches
# the label this run is stored under.
model = xgb.XGBClassifier(booster='gblinear', n_estimators=100, gamma=1)
model.fit(X=training_features, y=train_y)

Parameters: { "gamma" } are not used.



In [21]:
# Label under which this run is stored in both result dicts.
run_label = 'gblinear_100_1gamma'

# Score each split once and reuse the value for the printout and the table,
# instead of running the prediction pass twice per split.
train_accuracy = model.score(training_features, train_y)
val_accuracy = model.score(validation_features, val_y)

print('Training accuracy:', train_accuracy)
train_results[run_label] = train_accuracy

print('Validation accuracy:', val_accuracy)
val_results[run_label] = val_accuracy

Training accuracy: 1.0
Validation accuracy: 0.6511111111111111


In [22]:
# Predict the validation set, then print the per-class report.
val_yhat_result = model.predict(validation_features)

print(
    'Validation classification Report \n',
    classification_report(val_y, val_yhat_result),
    sep='\n',
)

Validation classification Report 

              precision    recall  f1-score   support

           0       0.60      0.68      0.64       114
           1       0.36      0.46      0.41       141
           2       0.73      0.65      0.68       271
           3       0.86      0.99      0.92        90
           4       0.67      0.22      0.33        18
           5       0.40      0.13      0.20        15
           6       0.19      0.26      0.22        23
           7       0.72      0.98      0.83        44
           8       0.60      0.54      0.57        50
           9       0.91      0.82      0.86        49
          10       0.91      0.85      0.88        48
          11       0.74      0.61      0.67        46
          12       0.66      0.74      0.70        54
          13       0.51      0.73      0.60        48
          14       0.65      0.89      0.76        38
          15       0.45      0.34      0.39       117
          16       0.98      0.93      0.95   

Okay. So the gblinear model runs faster but gives worse results (0.651 validation accuracy versus 0.669 for gbtree with the same settings). Wild. Let me increase the number of estimators.

In [23]:
# Tree booster again, now with 200 estimators (gamma = 1).
model = xgb.XGBClassifier(booster='gbtree', n_estimators=200, gamma=1)
model.fit(X=training_features, y=train_y)

In [24]:
# Label under which this run is stored in both result dicts.
run_label = 'gbtree_200_1gamma'

# Score each split once and reuse the value for the printout and the table,
# instead of running the prediction pass twice per split.
train_accuracy = model.score(training_features, train_y)
val_accuracy = model.score(validation_features, val_y)

print('Training accuracy:', train_accuracy)
train_results[run_label] = train_accuracy

print('Validation accuracy:', val_accuracy)
val_results[run_label] = val_accuracy

Training accuracy: 1.0
Validation accuracy: 0.6681481481481482


In [25]:
# Predict the validation set, then print the per-class report.
val_yhat_result = model.predict(validation_features)

print(
    'Validation classification Report \n',
    classification_report(val_y, val_yhat_result),
    sep='\n',
)

Validation classification Report 

              precision    recall  f1-score   support

           0       0.68      0.81      0.74       114
           1       0.35      0.46      0.40       141
           2       0.75      0.67      0.71       271
           3       0.80      0.93      0.86        90
           4       0.40      0.11      0.17        18
           5       0.00      0.00      0.00        15
           6       0.34      0.52      0.41        23
           7       0.93      0.98      0.96        44
           8       0.67      0.70      0.69        50
           9       0.89      0.82      0.85        49
          10       0.82      0.67      0.74        48
          11       0.93      0.54      0.68        46
          12       0.82      0.94      0.88        54
          13       0.50      0.73      0.59        48
          14       0.56      0.89      0.69        38
          15       0.53      0.36      0.43       117
          16       0.92      0.89      0.91   

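Okay, so increasing the number of estimators from 100 to 200 did not really change the model: validation accuracy is 0.668 versus 0.669 before, and training accuracy is still 1.0, so it is just as overfit.
The per-class precision and recall are essentially unchanged too.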


In [26]:
# Tree booster, 100 estimators, gamma = 1, with an explicit learning rate of 0.25.
model = xgb.XGBClassifier(
    booster='gbtree',
    n_estimators=100,
    gamma=1,
    learning_rate=0.25,
)
model.fit(X=training_features, y=train_y)

In [27]:
# Label under which this run is stored in both result dicts.
run_label = 'gbtree_100_1gamma_0.25learningrate'

# Score each split once and reuse the value for the printout and the table,
# instead of running the prediction pass twice per split.
train_accuracy = model.score(training_features, train_y)
val_accuracy = model.score(validation_features, val_y)

print('Training accuracy:', train_accuracy)
train_results[run_label] = train_accuracy

print('Validation accuracy:', val_accuracy)
val_results[run_label] = val_accuracy

Training accuracy: 1.0
Validation accuracy: 0.6762962962962963


In [28]:
# Predict the validation set, then print the per-class report.
val_yhat_result = model.predict(validation_features)

print(
    'Validation classification Report \n',
    classification_report(val_y, val_yhat_result),
    sep='\n',
)

Validation classification Report 

              precision    recall  f1-score   support

           0       0.71      0.81      0.75       114
           1       0.35      0.49      0.41       141
           2       0.76      0.63      0.69       271
           3       0.81      0.94      0.87        90
           4       0.50      0.11      0.18        18
           5       0.00      0.00      0.00        15
           6       0.40      0.70      0.51        23
           7       0.91      0.98      0.95        44
           8       0.74      0.64      0.69        50
           9       0.93      0.86      0.89        49
          10       0.84      0.77      0.80        48
          11       0.89      0.54      0.68        46
          12       0.80      0.98      0.88        54
          13       0.51      0.77      0.62        48
          14       0.55      0.87      0.67        38
          15       0.54      0.37      0.44       117
          16       0.95      0.95      0.95   

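Slightly better: with `learning_rate = 0.25` the validation accuracy rises to 0.676 (from 0.669 with the default learning rate). Training accuracy is still 1.0, though, so the model is just as overfit.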

In [30]:
# Linear booster, 300 estimators, learning rate 0.5.
# NOTE(review): XGBoost reports that `gamma` is not used with this booster
# (see the warning in the cell output), so it has no effect on this run.
model = xgb.XGBClassifier(
    booster='gblinear',
    n_estimators=300,
    gamma=1,
    learning_rate=0.5,
)
model.fit(X=training_features, y=train_y)

Parameters: { "gamma" } are not used.



In [31]:
# The model scored here is the gblinear / 300 estimators / learning_rate 0.5
# run, so it gets its own label. Storing it under
# 'gbtree_100_1gamma_0.25learningrate' would overwrite the earlier gbtree
# run's entry in both result dicts.
run_label = 'gblinear_300_1gamma_0.5learningrate'

# Score each split once and reuse the value for the printout and the table,
# instead of running the prediction pass twice per split.
train_accuracy = model.score(training_features, train_y)
val_accuracy = model.score(validation_features, val_y)

print('Training accuracy:', train_accuracy)
train_results[run_label] = train_accuracy

print('Validation accuracy:', val_accuracy)
val_results[run_label] = val_accuracy

Training accuracy: 1.0
Validation accuracy: 0.6466666666666666


In [32]:
# Predict the validation set, then print the per-class report.
val_yhat_result = model.predict(validation_features)

print(
    'Validation classification Report \n',
    classification_report(val_y, val_yhat_result),
    sep='\n',
)

Validation classification Report 

              precision    recall  f1-score   support

           0       0.60      0.70      0.65       114
           1       0.36      0.44      0.40       141
           2       0.73      0.64      0.68       271
           3       0.84      0.99      0.91        90
           4       0.57      0.22      0.32        18
           5       0.38      0.20      0.26        15
           6       0.24      0.30      0.27        23
           7       0.74      0.95      0.83        44
           8       0.54      0.56      0.55        50
           9       0.87      0.84      0.85        49
          10       0.87      0.85      0.86        48
          11       0.68      0.61      0.64        46
          12       0.62      0.74      0.68        54
          13       0.51      0.73      0.60        48
          14       0.62      0.89      0.73        38
          15       0.45      0.29      0.35       117
          16       0.96      0.93      0.94   

No improvement here: validation accuracy is 0.647, the lowest of all the runs so far, and training accuracy is still 1.0.

# Review results from all models

In [29]:
# One (label, accuracy) row per stored run, rounded to three decimals.
train_results_df = pd.DataFrame(
    list(train_results.items()),
    columns=['Parameters', 'Train_Accuracy'],
).round(3)
val_results_df = pd.DataFrame(
    list(val_results.items()),
    columns=['Parameters', 'Val_Accuracy'],
).round(3)

# Join the two tables on the run label and order the rows by that label.
result_df = (
    train_results_df
    .merge(val_results_df, on='Parameters')
    .sort_values('Parameters')
)
result_df

Unnamed: 0,Parameters,Train_Accuracy,Val_Accuracy
2,gblinear_100_1gamma,1.0,0.651
0,gbtree_100_0.25gamma,1.0,0.688
1,gbtree_100_1gamma,1.0,0.669
4,gbtree_100_1gamma_0.25learningrate,1.0,0.676
3,gbtree_200_1gamma,1.0,0.668
