# **Random Forest**
- Using data that was annotated
- Training it on the unbalanced data
- Testing it on the best forming combination
- 1 second window
- Entropy is the measure of information contained in a state



In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Standard libraries
import numpy as np
import pandas as pd
import time
import os

# For audio
from IPython.display import Audio
import librosa

# For preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf

# For modeling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


# Operational
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import scipy.ndimage
import pygame
import time
from scipy.signal import butter, filtfilt
import random

pygame 2.6.0 (SDL 2.28.4, Python 3.10.12)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [3]:
pkl_path = '/content/drive/My Drive/Final-Year-Project/Dataset/Project-V4/feature-extraction/Annotated/NotAveragePooled/split_features_1s_all.pkl'

# Load the pickle file
with open(pkl_path, 'rb') as file:
    data = pickle.load(file)

In [4]:
train_data = data['train'].copy()
val_data = data['val'].copy()

In [None]:
train_data

In [5]:
train_labels = train_data['label'].copy()
temp = train_data.copy()
del temp['label']
tr_features = temp

In [6]:
val_labels = val_data['label'].copy()
temp = val_data.copy()
del temp['label']
v_features = temp

# Shuffle Data

In [7]:
def shuffle_data(input_label, input_features):
  input_len = len(input_label)
  np.random.seed(1826)
  input_indices = np.random.permutation(input_len)
  input_features = {key: np.array([input_features[key][i] for i in input_indices]) for key in input_features} # dictionary comprehension
  input_label = np.array([input_label[i] for i in input_indices])

  return input_label, input_features

In [8]:
train_y, train_features = shuffle_data(train_labels, tr_features)

In [9]:
val_y, val_features = shuffle_data(val_labels, v_features)

In [None]:
display(train_y.shape)
display(train_y[:15])

In [None]:
for key in train_features.keys():
  display(key)
  display(train_features[key].shape)
  display(train_features[key][0])

In [None]:
display(val_y.shape)
display(val_y[:15])

In [None]:
for key in val_features.keys():
  display(key)
  display(val_features[key].shape)
  display(val_features[key][0])

## **Random Forest Model**

In [28]:
train_results = {}
val_results = {}

### **With all the features**

In [11]:
training_features_3D = np.concatenate((train_features['mfcc'], train_features['chroma'], train_features['rms'], train_features['melspectrogram']), axis=1)
training_features = training_features_3D.reshape(training_features_3D.shape[0], -1)
training_features.shape

(12565, 4092)

In [12]:
validation_features_3D = np.concatenate((val_features['mfcc'], val_features['chroma'], val_features['rms'], val_features['melspectrogram']), axis=1)
validation_features = validation_features_3D.reshape(validation_features_3D.shape[0], -1)
validation_features.shape

(3318, 4092)

Fit the model with training data with criterion = 'log_loss'... comparing it to entropy

In [29]:
rf = RandomForestClassifier(n_estimators=50, criterion='log_loss')
rf.fit(training_features, train_y)

In [30]:
print('Training accuracy:', rf.score(training_features, train_y))
train_results['log_loss_50'] = rf.score(training_features, train_y)

print('Validation accuracy:', rf.score(validation_features, val_y))
val_results['log_loss_50'] = rf.score(validation_features, val_y)

Training accuracy: 1.0
Validation accuracy: 0.6434599156118144


No noticeable improvement from using entropy to log_loss.

Fit the model with training data within criterion = 'gini'... comparing it to entropy to see accuracy improvement


In [31]:
rf = RandomForestClassifier(n_estimators=50, criterion='gini')
rf.fit(training_features, train_y)

In [32]:
print('Training accuracy:', rf.score(training_features, train_y))
train_results['gini_50'] = rf.score(training_features, train_y)

print('Validation accuracy:', rf.score(validation_features, val_y))
val_results['gini_50'] = rf.score(validation_features, val_y)

Training accuracy: 1.0
Validation accuracy: 0.6280892103676914


Accuracy decreased when using 'gini' as the criterion... so wont be using it from now.

I will try and increase the number of estimators, while using entropy criterion

In [33]:
rf = RandomForestClassifier(n_estimators=100, criterion='entropy')
rf.fit(training_features, train_y)

In [34]:
print('Training accuracy:', rf.score(training_features, train_y))
train_results['entropy_100'] = rf.score(training_features, train_y)

print('Validation accuracy:', rf.score(validation_features, val_y))
val_results['entropy_100'] = rf.score(validation_features, val_y)

Training accuracy: 1.0
Validation accuracy: 0.6633514165159735


Increasing the number of estimators from 50 to 100 increased the accuracy by 1%. Which is still a small change and I dont think is worth the extra computational complexity. So 50 will remain the number of estimators

Going to change the class_weight to automatically adjust the weights inversely proportional to class frequencies. Apparently its good with balancing out the data/results. Default is "none"

In [35]:
rf = RandomForestClassifier(n_estimators=50, criterion='entropy', class_weight='balanced')
rf.fit(training_features, train_y)

In [36]:
print('Training accuracy:', rf.score(training_features, train_y))
train_results['entropy_50_balanced'] = rf.score(training_features, train_y)

print('Validation accuracy:', rf.score(validation_features, val_y))
val_results['entropy_50_balanced'] = rf.score(validation_features, val_y)

Training accuracy: 1.0
Validation accuracy: 0.635623869801085


The use of balanced class weights decreased the accuracy, so ineffective when it comes to improving model accuracy

I also checked how using smaller number of estimators fairs and it decreases the accuracy, so will not be entertained further.

Increasing the amount of estimators really doesnt improve the model... especially since with 50, it is already overfitting. We can add new training data, or try another model. Im gonna choose the latter. On to XG

# Review results from all models

In [38]:
train_results_df = pd.DataFrame(list(train_results.items()), columns=['Parameters', 'Train_Accuracy']).round(2)
val_results_df = pd.DataFrame(list(val_results.items()), columns=['Parameters', 'Val_Accuracy']).round(2)

result_df = train_results_df.merge(val_results_df, on='Parameters')
result_df = result_df.sort_values('Parameters')
result_df

Unnamed: 0,Parameters,Train_Accuracy,Val_Accuracy
2,entropy_100,1.0,0.66
3,entropy_50_balanced,1.0,0.64
1,gini_50,1.0,0.63
0,log_loss_50,1.0,0.64



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

