<a href="https://colab.research.google.com/github/Sam-Kon/FactCheckExtension/blob/main/ICM_Tonic_Notebooks/Librosa_MFCC/Models_Librosa_MFCCs_100_Components_(UMAP)_includesFindingBestNComponents.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set-up
imports, mount google drive

In [19]:
# connect to Google Drive
from google.colab import drive
drive.mount('/content/drive')
from google.colab import auth
auth.authenticate_user()
import gspread
from google.auth import default
creds, _ = default()
gc = gspread.authorize(creds)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
# imports
import numpy as np
import pandas as pd
import librosa as lb
import librosa.display
import matplotlib.pyplot as plt
import seaborn as sns
import joblib # to export model

In [21]:
def getListIndex(nrow, ncol,row_pos, col_pos):
    list_pos = row_pos*ncol + col_pos
    return(list_pos)

In [22]:
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

# Import MFCC data + tonic annotations

In [23]:
worksheet = gc.open('AAR C Data').sheet1

# get_all_values gives a list of rows.
dataset = worksheet.get_all_values()

dataset = pd.DataFrame.from_records(dataset)

In [24]:
label = dataset[dataset. columns[0]]
label.value_counts()

C#3 Scale (1.5 Kattai)    247
D3 Scale (2 Kattai)       236
D#3 Scale (2.5 Kattai)    189
C3 Scale (1 Kattai)       183
E3 Scale (3 Kattai)       124
F3 Scale (4 Kattai)        91
G#3 Scale (5.5 Kattai)     90
G3 Scale (5 Kattai)        89
A#3 Scale (6.5 Kattai)     81
F#3 Scale (4.5 Kattai)     62
B2 Scale (7 Kattai)        59
A3 Scale (6 Kattai)        57
A2 Scale (6 Kattai)        32
E4 Scale (3 Kattai)        21
D#4 Scale (2.5 Kattai)     15
C4 Scale (1 Kattai)        15
B3 Scale (7 Kattai)        12
C#4 Scale (1.5 Kattai)     12
D4 Scale (2 Kattai)        10
A#2 Scale (6.5 Kattai)      9
Name: 0, dtype: int64

# UMAP

format dataset

In [25]:
dataset = dataset.drop(labels = dataset.columns[0],axis = 1)

In [26]:
dataset = dataset.transpose()
dataset.reset_index(inplace=True, drop=True)
dataset = dataset.transpose()

Import UMAP and create a labels list with numerical labels instead of strings

In [None]:
!pip install umap-learn

In [28]:
from umap import UMAP

In [29]:
labToNum = {"A2 Scale (6 Kattai)":0,
            "A#2 Scale (6.5 Kattai)":1,
            "B2 Scale (7 Kattai)":2,
            "C3 Scale (1 Kattai)":3,
            "C#3 Scale (1.5 Kattai)":4,
            "D3 Scale (2 Kattai)":5,
            "D#3 Scale (2.5 Kattai)":6,
            "E3 Scale (3 Kattai)":7,
            "F3 Scale (4 Kattai)":8,
            "F#3 Scale (4.5 Kattai)":9,
            "G3 Scale (5 Kattai)":10,
            "G#3 Scale (5.5 Kattai)":11,
            "A3 Scale (6 Kattai)":12,
            "A#3 Scale (6.5 Kattai)":13,
            "B3 Scale (7 Kattai)":14,
            "C4 Scale (1 Kattai)":15,
            "C#4 Scale (1.5 Kattai)":16,
            "D4 Scale (2 Kattai)":17,
            "D#4 Scale (2.5 Kattai)":18,
            "E4 Scale (3 Kattai)":19}

In [None]:
numLabel = pd.Series()
for lab in label:
  numLabel = numLabel.append(pd.Series([labToNum.get(lab)]))

In [31]:
numLabel.reset_index(drop=True)

0        7
1        7
2        6
3        7
4        6
        ..
1629    12
1630     3
1631     4
1632     3
1633    10
Length: 1634, dtype: int64

Use GridSearch to determine best n_components for UMAP

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
# Split the dataset into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(
    dataset, numLabel, test_size=0.3, random_state=42
)

# Classification with a linear SVM
svc = LinearSVC(dual=False, random_state=123)
params_grid = {"C": [10 ** k for k in range(-2, 3)]}
clf = GridSearchCV(svc, params_grid)
clf.fit(X_train, y_train)
print(
    "Accuracy on the test set with raw data: {:.3f}".format(clf.score(X_test, y_test))
)

# Transformation with UMAP followed by classification with a linear SVM
umap = UMAP(random_state=456)
pipeline = Pipeline([("umap", umap), ("svc", svc)])
params_grid_pipeline = {
    "umap__n_neighbors": [20, 100, 200, 500],
    "umap__n_components": [20, 100, 200, 500],
    "svc__C": [10 ** k for k in range(-3, 4)],
}

clf_pipeline = GridSearchCV(pipeline, params_grid_pipeline)
clf_pipeline.fit(X_train, y_train)
print(
    "Accuracy on the test set with UMAP transformation: {:.3f}".format(
        clf_pipeline.score(X_test, y_test)
    )
)

In [33]:
print(clf_pipeline.best_params_)

{'svc__C': 10, 'umap__n_components': 100, 'umap__n_neighbors': 20}


Reduce to 100 components using UMAP, save in Google Sheet "AAR 100 UMAP Data"

In [None]:
manifold = umap.UMAP(n_components = 100, n_neighbors = 101, random_state=42).fit(dataset, y=numLabel)
data_reduced = manifold.transform(dataset)

In [None]:
dataset = pd.DataFrame(data_reduced)
dataset.insert(0, "Tonic", label, True) # inserting the tonic as the first column

In [None]:
count_row = dataset.shape[0]
count_col = dataset.shape[1]

# Open our sheet and add some data.
worksheet = gc.open('AAR 100 UMAP Data').sheet1
#cell_list = worksheet.range('A1:M'+str(length))

# note this outputs data from the 1st row
cell_list = worksheet.range(1,1,count_row,count_col)

for row in range(0,count_row):
    for col in range(0,count_col):
        cell_list[count_col*row + col].value = str(dataset.iloc[row,col])

worksheet.update_cells(cell_list)

{'spreadsheetId': '1oU5a2ffmy9xUUceP2A3T-thaLth3CbWZFcTJviuHgQ8',
 'updatedRange': 'Sheet1!A1:CW1634',
 'updatedRows': 1634,
 'updatedColumns': 101,
 'updatedCells': 165034}

# Dataset Formatting

In [None]:
worksheet = gc.open('AAR 100 UMAP Data').sheet1

# get_all_values gives a list of rows.
dataset = worksheet.get_all_values()

dataset = pd.DataFrame.from_records(dataset)

label = dataset[dataset. columns[0]]
label.value_counts()

C#3 Scale (1.5 Kattai)    247
D3 Scale (2 Kattai)       236
D#3 Scale (2.5 Kattai)    189
C3 Scale (1 Kattai)       183
E3 Scale (3 Kattai)       124
F3 Scale (4 Kattai)        91
G#3 Scale (5.5 Kattai)     90
G3 Scale (5 Kattai)        89
A#3 Scale (6.5 Kattai)     81
F#3 Scale (4.5 Kattai)     62
B2 Scale (7 Kattai)        59
A3 Scale (6 Kattai)        57
A2 Scale (6 Kattai)        32
E4 Scale (3 Kattai)        21
D#4 Scale (2.5 Kattai)     15
C4 Scale (1 Kattai)        15
B3 Scale (7 Kattai)        12
C#4 Scale (1.5 Kattai)     12
D4 Scale (2 Kattai)        10
A#2 Scale (6.5 Kattai)      9
Name: 0, dtype: int64

In [None]:
noList = ("A#2 Scale (6.5 Kattai)", "D4 Scale (2 Kattai)", "C#4 Scale (1.5 Kattai)", "B3 Scale (7 Kattai)", "C4 Scale (1 Kattai)",
          "D#4 Scale (2.5 Kattai)", "E4 Scale (3 Kattai)", "A2 Scale (6 Kattai)")

df = pd.DataFrame()

for i in range(1634):
  if not label[i] in noList:
    df = pd.concat((df, pd.DataFrame(dataset.iloc[i:i+1, :])))

In [None]:
df.reset_index()
tonics = df[df. columns[0]]
tonics.value_counts()

C#3 Scale (1.5 Kattai)    247
D3 Scale (2 Kattai)       236
D#3 Scale (2.5 Kattai)    189
C3 Scale (1 Kattai)       183
E3 Scale (3 Kattai)       124
F3 Scale (4 Kattai)        91
G#3 Scale (5.5 Kattai)     90
G3 Scale (5 Kattai)        89
A#3 Scale (6.5 Kattai)     81
F#3 Scale (4.5 Kattai)     62
B2 Scale (7 Kattai)        59
A3 Scale (6 Kattai)        57
Name: 0, dtype: int64

In [None]:
df = df.drop(labels = df.columns[0],axis = 1)

In [None]:
df.reset_index(drop=True)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
0,0.022981968,4.2804294,8.285394,7.2485013,1.2746494,5.759833,5.8325157,1.686016,6.1828156,3.6473215,...,5.1780515,5.149211,5.6728916,7.3934174,3.298634,5.884857,5.644476,4.8044415,5.483432,9.339743
1,0.004123352,4.0412974,8.206209,7.3083735,1.1653286,5.709118,5.7989435,1.6582698,6.081946,3.7065127,...,5.183488,5.1543846,5.6569376,7.3868213,3.290264,5.8739953,5.64826,4.8170404,5.4496717,9.352541
2,0.22400266,3.1434057,10.058858,7.059376,1.5042968,5.5203085,5.94375,2.4749625,7.7212863,1.9483044,...,5.1353207,5.1682076,5.9107647,7.536221,3.2362502,6.0487194,5.5377407,4.7038097,5.532262,9.552816
3,0.06412787,3.9325058,8.382094,6.860237,1.3337317,5.7766533,6.0650115,1.8067149,6.214318,3.4474401,...,5.152579,5.174193,5.6933975,7.402943,3.316752,5.940731,5.623779,4.7619243,5.504437,9.242134
4,0.3717064,2.3253858,9.579565,6.484855,1.434819,5.509518,6.033272,2.79304,7.3474107,2.5752242,...,5.11888,5.257889,5.8871694,7.56624,3.2434325,6.174843,5.4196963,4.5645247,5.6553917,9.425689
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1503,-0.23020922,6.2167954,9.504517,7.2032156,2.461675,6.690058,6.6478863,2.3701258,8.3511915,3.8890407,...,5.3628235,4.707222,5.9968877,7.5663266,3.1952312,5.9453053,5.7025185,4.790599,5.894288,9.659461
1504,0.71978915,1.2587576,7.8979993,6.5435596,0.992732,6.8378916,6.5510173,2.452035,7.9484086,3.2686214,...,5.2085342,5.1886835,5.4962053,7.669247,3.238259,5.8179507,5.5502315,4.7423387,5.577921,9.450812
1505,0.1432361,4.6970825,8.043038,7.904831,0.9802014,5.6607075,6.706379,2.1085148,7.1531014,5.2669153,...,5.3303394,4.8366146,5.8969483,7.483586,3.1624851,5.83112,5.671833,4.800957,5.6542263,9.666865
1506,0.3454836,2.738899,8.028954,6.2867565,1.5605189,7.019148,6.4329576,2.3131263,8.075919,2.9431515,...,5.2551074,5.1134996,5.5847197,7.616472,3.2411916,5.934906,5.5922675,4.736185,5.680413,9.457784


# Split dataset

In [None]:
# Import train_test_split function
from sklearn.model_selection import train_test_split

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(df, tonics, test_size=0.3, random_state=109) # 70% training and 30% test
# random_state = a number means that this line of code will always produce the same split of data, instead of being random each time it is run
# see https://stackoverflow.com/questions/28064634/random-state-pseudo-random-number-in-scikit-learn for more

# Balance dataset

In [None]:
# balance the dataset by undersampling the larger classes (tonics)
# see this tutorial for more info: https://towardsdatascience.com/how-to-balance-a-dataset-in-python-36dff9d12704
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
under_sampler = RandomUnderSampler(random_state=42)
X_res, y_res = under_sampler.fit_resample(X_train, y_train)
print(f"Training target statistics: {Counter(y_res)}")
print(f"Testing target statistics: {Counter(y_test)}")

Training target statistics: Counter({'A#3 Scale (6.5 Kattai)': 41, 'A3 Scale (6 Kattai)': 41, 'B2 Scale (7 Kattai)': 41, 'C#3 Scale (1.5 Kattai)': 41, 'C3 Scale (1 Kattai)': 41, 'D#3 Scale (2.5 Kattai)': 41, 'D3 Scale (2 Kattai)': 41, 'E3 Scale (3 Kattai)': 41, 'F#3 Scale (4.5 Kattai)': 41, 'F3 Scale (4 Kattai)': 41, 'G#3 Scale (5.5 Kattai)': 41, 'G3 Scale (5 Kattai)': 41})
Testing target statistics: Counter({'D3 Scale (2 Kattai)': 77, 'C#3 Scale (1.5 Kattai)': 71, 'D#3 Scale (2.5 Kattai)': 56, 'C3 Scale (1 Kattai)': 54, 'E3 Scale (3 Kattai)': 34, 'F3 Scale (4 Kattai)': 30, 'G3 Scale (5 Kattai)': 30, 'A#3 Scale (6.5 Kattai)': 25, 'G#3 Scale (5.5 Kattai)': 24, 'F#3 Scale (4.5 Kattai)': 20, 'B2 Scale (7 Kattai)': 18, 'A3 Scale (6 Kattai)': 14})


# **SVM - support vector machine**

Unbalanced

GridSearchCV: https://www.geeksforgeeks.org/svm-hyperparameter-tuning-using-gridsearchcv-ml/

In [None]:
# Import svm model
from sklearn import svm

from sklearn.model_selection import GridSearchCV

# defining parameter range
param_grid = {'C': [0.1, 0.5, 1.0, 1.5, 10, 100, 1000],
              'gamma': ['scale', 'auto', 1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['poly', 'linear', 'sigmoid', 'rbf'],
              'decision_function_shape': ['ovr']}

grid = GridSearchCV(svm.SVC(), param_grid, refit = True, verbose = 3)

# fitting the model for grid search
grid.fit(X_train, y_train)

Fitting 5 folds for each of 196 candidates, totalling 980 fits
[CV 1/5] END C=0.1, decision_function_shape=ovr, gamma=scale, kernel=poly;, score=0.943 total time=   0.1s
[CV 2/5] END C=0.1, decision_function_shape=ovr, gamma=scale, kernel=poly;, score=0.962 total time=   0.1s
[CV 3/5] END C=0.1, decision_function_shape=ovr, gamma=scale, kernel=poly;, score=0.957 total time=   0.0s
[CV 4/5] END C=0.1, decision_function_shape=ovr, gamma=scale, kernel=poly;, score=0.962 total time=   0.1s
[CV 5/5] END C=0.1, decision_function_shape=ovr, gamma=scale, kernel=poly;, score=0.924 total time=   0.1s
[CV 1/5] END C=0.1, decision_function_shape=ovr, gamma=scale, kernel=linear;, score=0.943 total time=   0.1s
[CV 2/5] END C=0.1, decision_function_shape=ovr, gamma=scale, kernel=linear;, score=0.967 total time=   0.0s
[CV 3/5] END C=0.1, decision_function_shape=ovr, gamma=scale, kernel=linear;, score=0.962 total time=   0.0s
[CV 4/5] END C=0.1, decision_function_shape=ovr, gamma=scale, kernel=linear

In [None]:
# print best parameter after tuning
print(grid.best_params_)

# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

{'C': 1.0, 'decision_function_shape': 'ovr', 'gamma': 'auto', 'kernel': 'poly'}
SVC(gamma='auto', kernel='poly')


In [None]:
from sklearn import metrics

grid_predictions = grid.predict(X_test)

# print classification report and accuracy, Matthews Correlaction Coefficient
print(metrics.classification_report(y_test, grid_predictions))
print("Accuracy:", metrics.accuracy_score(y_test, grid_predictions))
print("MCC:", metrics.matthews_corrcoef(y_test, grid_predictions))

                        precision    recall  f1-score   support

A#3 Scale (6.5 Kattai)       1.00      1.00      1.00        25
   A3 Scale (6 Kattai)       1.00      1.00      1.00        14
   B2 Scale (7 Kattai)       1.00      0.83      0.91        18
C#3 Scale (1.5 Kattai)       0.99      1.00      0.99        71
   C3 Scale (1 Kattai)       1.00      1.00      1.00        54
D#3 Scale (2.5 Kattai)       1.00      1.00      1.00        56
   D3 Scale (2 Kattai)       1.00      1.00      1.00        77
   E3 Scale (3 Kattai)       0.97      1.00      0.99        34
F#3 Scale (4.5 Kattai)       0.91      1.00      0.95        20
   F3 Scale (4 Kattai)       1.00      0.97      0.98        30
G#3 Scale (5.5 Kattai)       1.00      1.00      1.00        24
   G3 Scale (5 Kattai)       1.00      1.00      1.00        30

              accuracy                           0.99       453
             macro avg       0.99      0.98      0.99       453
          weighted avg       0.99     

Balanced

In [None]:
# Import svm model
from sklearn import svm

from sklearn.model_selection import GridSearchCV

# defining parameter range
param_grid = {'C': [0.1, 0.5, 1.0, 1.5, 10, 100, 1000],
              'gamma': ['scale', 'auto', 1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['linear', 'poly', 'sigmoid', 'rbf'],
              'decision_function_shape': ['ovr']}

grid = GridSearchCV(svm.SVC(), param_grid, refit = True, verbose = 3)

# fitting the model for grid search
grid.fit(X_res, y_res)

Fitting 5 folds for each of 196 candidates, totalling 980 fits
[CV 1/5] END C=0.1, decision_function_shape=ovr, gamma=scale, kernel=linear;, score=0.929 total time=   0.0s
[CV 2/5] END C=0.1, decision_function_shape=ovr, gamma=scale, kernel=linear;, score=0.939 total time=   0.0s
[CV 3/5] END C=0.1, decision_function_shape=ovr, gamma=scale, kernel=linear;, score=0.898 total time=   0.0s
[CV 4/5] END C=0.1, decision_function_shape=ovr, gamma=scale, kernel=linear;, score=0.949 total time=   0.0s
[CV 5/5] END C=0.1, decision_function_shape=ovr, gamma=scale, kernel=linear;, score=0.898 total time=   0.0s
[CV 1/5] END C=0.1, decision_function_shape=ovr, gamma=scale, kernel=poly;, score=0.919 total time=   0.0s
[CV 2/5] END C=0.1, decision_function_shape=ovr, gamma=scale, kernel=poly;, score=0.949 total time=   0.0s
[CV 3/5] END C=0.1, decision_function_shape=ovr, gamma=scale, kernel=poly;, score=0.898 total time=   0.0s
[CV 4/5] END C=0.1, decision_function_shape=ovr, gamma=scale, kernel=po

In [None]:
# print best parameter after tuning
print(grid.best_params_)

# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

{'C': 0.5, 'decision_function_shape': 'ovr', 'gamma': 'auto', 'kernel': 'poly'}
SVC(C=0.5, gamma='auto', kernel='poly')


In [None]:
grid_predictions = grid.predict(X_test)

# print classification report
print(metrics.classification_report(y_test, grid_predictions))
print("Accuracy:", metrics.accuracy_score(y_test, grid_predictions))
print("MCC:", metrics.matthews_corrcoef(y_test, grid_predictions))

                        precision    recall  f1-score   support

A#3 Scale (6.5 Kattai)       1.00      1.00      1.00        25
   A3 Scale (6 Kattai)       1.00      1.00      1.00        14
   B2 Scale (7 Kattai)       1.00      0.83      0.91        18
C#3 Scale (1.5 Kattai)       0.99      1.00      0.99        71
   C3 Scale (1 Kattai)       1.00      0.96      0.98        54
D#3 Scale (2.5 Kattai)       1.00      1.00      1.00        56
   D3 Scale (2 Kattai)       1.00      0.96      0.98        77
   E3 Scale (3 Kattai)       0.92      1.00      0.96        34
F#3 Scale (4.5 Kattai)       0.91      1.00      0.95        20
   F3 Scale (4 Kattai)       0.91      0.97      0.94        30
G#3 Scale (5.5 Kattai)       1.00      1.00      1.00        24
   G3 Scale (5 Kattai)       1.00      1.00      1.00        30

              accuracy                           0.98       453
             macro avg       0.98      0.98      0.98       453
          weighted avg       0.98     

# **KNN - k-nearest neighbors**

In [None]:
# Scale the features using StandardScaler
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Unbalanced

GridSearchCV: https://machinelearningknowledge.ai/knn-classifier-in-sklearn-using-gridsearchcv-with-example/#vii_Model_fitting_with_K-cross_Validation_and_GridSearchCV

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

k_range = list(range(1, 31))
param_grid = dict(n_neighbors=k_range)

# defining parameter range
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=10, scoring='accuracy', return_train_score=False, verbose=1)

# fitting the model for grid search
grid.fit(X_train, y_train)

Fitting 10 folds for each of 30 candidates, totalling 300 fits


In [None]:
# print best parameter after tuning
print(grid.best_params_)

# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

{'n_neighbors': 1}
KNeighborsClassifier(n_neighbors=1)


In [None]:
grid_predictions = grid.predict(X_test)

# print classification report
print(metrics.classification_report(y_test, grid_predictions))
print("Accuracy:", metrics.accuracy_score(y_test, grid_predictions))
print("MCC:", metrics.matthews_corrcoef(y_test, grid_predictions))

                        precision    recall  f1-score   support

A#3 Scale (6.5 Kattai)       1.00      1.00      1.00        25
   A3 Scale (6 Kattai)       1.00      1.00      1.00        14
   B2 Scale (7 Kattai)       1.00      0.94      0.97        18
C#3 Scale (1.5 Kattai)       0.99      1.00      0.99        71
   C3 Scale (1 Kattai)       1.00      1.00      1.00        54
D#3 Scale (2.5 Kattai)       1.00      1.00      1.00        56
   D3 Scale (2 Kattai)       1.00      1.00      1.00        77
   E3 Scale (3 Kattai)       0.97      1.00      0.99        34
F#3 Scale (4.5 Kattai)       1.00      1.00      1.00        20
   F3 Scale (4 Kattai)       1.00      0.97      0.98        30
G#3 Scale (5.5 Kattai)       1.00      1.00      1.00        24
   G3 Scale (5 Kattai)       1.00      1.00      1.00        30

              accuracy                           1.00       453
             macro avg       1.00      0.99      0.99       453
          weighted avg       1.00     

Balanced

In [None]:
# Scale the features using StandardScaler
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_res = scaler.fit_transform(X_res)
X_test = scaler.transform(X_test)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

k_range = list(range(1, 31))
param_grid = dict(n_neighbors=k_range)

# defining parameter range
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=10, scoring='accuracy', return_train_score=False, verbose=1)

# fitting the model for grid search
grid.fit(X_res, y_res)

Fitting 10 folds for each of 30 candidates, totalling 300 fits


In [None]:
# print best parameter after tuning
print(grid.best_params_)

# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

{'n_neighbors': 1}
KNeighborsClassifier(n_neighbors=1)


In [None]:
grid_predictions = grid.predict(X_test)

# print classification report
print(metrics.classification_report(y_test, grid_predictions))
print("Accuracy:", metrics.accuracy_score(y_test, grid_predictions))
print("MCC:", metrics.matthews_corrcoef(y_test, grid_predictions))

                        precision    recall  f1-score   support

A#3 Scale (6.5 Kattai)       1.00      1.00      1.00        25
   A3 Scale (6 Kattai)       1.00      1.00      1.00        14
   B2 Scale (7 Kattai)       1.00      0.94      0.97        18
C#3 Scale (1.5 Kattai)       0.99      1.00      0.99        71
   C3 Scale (1 Kattai)       0.95      0.96      0.95        54
D#3 Scale (2.5 Kattai)       1.00      1.00      1.00        56
   D3 Scale (2 Kattai)       1.00      0.96      0.98        77
   E3 Scale (3 Kattai)       0.92      1.00      0.96        34
F#3 Scale (4.5 Kattai)       1.00      1.00      1.00        20
   F3 Scale (4 Kattai)       1.00      0.97      0.98        30
G#3 Scale (5.5 Kattai)       1.00      1.00      1.00        24
   G3 Scale (5 Kattai)       1.00      1.00      1.00        30

              accuracy                           0.98       453
             macro avg       0.99      0.99      0.99       453
          weighted avg       0.99     

# **DecisionTreeClassifier**

documentation: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

Unbalanced

GridSeachCV: https://plainenglish.io/blog/hyperparameter-tuning-of-decision-tree-classifier-using-gridsearchcv-2a6ebcaffeda

In [None]:
# Import model
from sklearn import tree

# defining parameter range
param_grid = { 'criterion': ['gini', 'entropy'],
               'max_depth': range(1,10),
               'min_samples_split': range(1,10),
               'min_samples_leaf': range(1,5) }

grid = GridSearchCV(tree.DecisionTreeClassifier(random_state=3), param_grid, refit = True, cv=10, verbose=1)

# fitting the model for grid search
grid.fit(X_train, y_train)

Fitting 10 folds for each of 648 candidates, totalling 6480 fits


720 fits failed out of a total of 6480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
720 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/tree/_classes.py", line 889, in fit
    super().fit(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/tree/_classes.py", line 177, in fit
    self._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklear

In [None]:
# print best parameter after tuning
print(grid.best_params_)

# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

{'criterion': 'entropy', 'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 7}
DecisionTreeClassifier(criterion='entropy', max_depth=7, min_samples_split=7,
                       random_state=3)


In [None]:
grid_predictions = grid.predict(X_test)

# print classification report
print(metrics.classification_report(y_test, grid_predictions))
print("Accuracy:", metrics.accuracy_score(y_test, grid_predictions))
print("MCC:", metrics.matthews_corrcoef(y_test, grid_predictions))

                        precision    recall  f1-score   support

A#3 Scale (6.5 Kattai)       0.93      1.00      0.96        25
   A3 Scale (6 Kattai)       1.00      0.93      0.96        14
   B2 Scale (7 Kattai)       0.92      0.67      0.77        18
C#3 Scale (1.5 Kattai)       0.95      1.00      0.97        71
   C3 Scale (1 Kattai)       0.96      0.96      0.96        54
D#3 Scale (2.5 Kattai)       1.00      1.00      1.00        56
   D3 Scale (2 Kattai)       1.00      0.99      0.99        77
   E3 Scale (3 Kattai)       0.92      1.00      0.96        34
F#3 Scale (4.5 Kattai)       0.90      0.90      0.90        20
   F3 Scale (4 Kattai)       0.94      0.97      0.95        30
G#3 Scale (5.5 Kattai)       1.00      0.96      0.98        24
   G3 Scale (5 Kattai)       1.00      0.93      0.97        30

              accuracy                           0.96       453
             macro avg       0.96      0.94      0.95       453
          weighted avg       0.97     

Balanced

GridSeachCV: https://plainenglish.io/blog/hyperparameter-tuning-of-decision-tree-classifier-using-gridsearchcv-2a6ebcaffeda

In [None]:
# Import model
from sklearn import tree
from sklearn.model_selection import GridSearchCV

# defining parameter range
param_grid = { 'criterion': ['gini', 'entropy'],
               'max_depth': range(1,10),
               'min_samples_split': range(1,10),
               'min_samples_leaf': range(1,5) }

grid = GridSearchCV(tree.DecisionTreeClassifier(random_state=3), param_grid, refit = True, cv=10, verbose=1)

# fitting the model for grid search
grid.fit(X_res, y_res)

Fitting 10 folds for each of 648 candidates, totalling 6480 fits


720 fits failed out of a total of 6480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
720 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/tree/_classes.py", line 889, in fit
    super().fit(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/tree/_classes.py", line 177, in fit
    self._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklear

In [None]:
# print best parameter after tuning
print(grid.best_params_)

# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

{'criterion': 'entropy', 'max_depth': 6, 'min_samples_leaf': 1, 'min_samples_split': 5}
DecisionTreeClassifier(criterion='entropy', max_depth=6, min_samples_split=5,
                       random_state=3)


In [None]:
grid_predictions = grid.predict(X_test)

# print classification report
print(metrics.classification_report(y_test, grid_predictions))
print("Accuracy:", metrics.accuracy_score(y_test, grid_predictions))
print("MCC:", metrics.matthews_corrcoef(y_test, grid_predictions))

                        precision    recall  f1-score   support

A#3 Scale (6.5 Kattai)       1.00      1.00      1.00        25
   A3 Scale (6 Kattai)       1.00      0.93      0.96        14
   B2 Scale (7 Kattai)       1.00      0.78      0.88        18
C#3 Scale (1.5 Kattai)       1.00      1.00      1.00        71
   C3 Scale (1 Kattai)       1.00      0.96      0.98        54
D#3 Scale (2.5 Kattai)       1.00      0.98      0.99        56
   D3 Scale (2 Kattai)       1.00      0.96      0.98        77
   E3 Scale (3 Kattai)       0.94      1.00      0.97        34
F#3 Scale (4.5 Kattai)       0.95      1.00      0.98        20
   F3 Scale (4 Kattai)       0.94      0.97      0.95        30
G#3 Scale (5.5 Kattai)       0.75      1.00      0.86        24
   G3 Scale (5 Kattai)       1.00      0.97      0.98        30

              accuracy                           0.97       453
             macro avg       0.97      0.96      0.96       453
          weighted avg       0.98     

# **RandomForestClassifier**

documentation: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

Unbalanced

GridSeachCV: https://www.kaggle.com/code/sociopath00/random-forest-using-gridsearchcv/notebook

In [None]:
# Import model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# defining parameter range
param_grid = {
    'n_estimators': [200, 500],
    'max_features': [None, 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid=param_grid, cv= 5)
grid.fit(X_train, y_train)

In [None]:
# print best parameter after tuning
print(grid.best_params_)

# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

{'criterion': 'entropy', 'max_depth': 6, 'max_features': 'sqrt', 'n_estimators': 200}
RandomForestClassifier(criterion='entropy', max_depth=6, n_estimators=200,
                       random_state=42)


In [None]:
grid_predictions = grid.predict(X_test)

# print classification report
print(metrics.classification_report(y_test, grid_predictions))
print("Accuracy:", metrics.accuracy_score(y_test, grid_predictions))
print("MCC:", metrics.matthews_corrcoef(y_test, grid_predictions))

                        precision    recall  f1-score   support

A#3 Scale (6.5 Kattai)       1.00      1.00      1.00        25
   A3 Scale (6 Kattai)       1.00      1.00      1.00        14
   B2 Scale (7 Kattai)       1.00      0.94      0.97        18
C#3 Scale (1.5 Kattai)       0.99      1.00      0.99        71
   C3 Scale (1 Kattai)       1.00      1.00      1.00        54
D#3 Scale (2.5 Kattai)       1.00      1.00      1.00        56
   D3 Scale (2 Kattai)       1.00      1.00      1.00        77
   E3 Scale (3 Kattai)       0.97      1.00      0.99        34
F#3 Scale (4.5 Kattai)       1.00      1.00      1.00        20
   F3 Scale (4 Kattai)       1.00      0.97      0.98        30
G#3 Scale (5.5 Kattai)       1.00      1.00      1.00        24
   G3 Scale (5 Kattai)       1.00      1.00      1.00        30

              accuracy                           1.00       453
             macro avg       1.00      0.99      0.99       453
          weighted avg       1.00     

Balanced

In [None]:
# Import model
from sklearn.ensemble import RandomForestClassifier

# defining parameter range
param_grid = {
    'n_estimators': [64, 100, 200, 500],
    'max_features': [None, 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid=param_grid, cv= 5)
grid.fit(X_res, y_res)

In [None]:
# print best parameter after tuning
print(grid.best_params_)

# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

{'criterion': 'gini', 'max_depth': 6, 'max_features': 'log2', 'n_estimators': 100}
RandomForestClassifier(max_depth=6, max_features='log2', random_state=42)


In [None]:
grid_predictions = grid.predict(X_test)

# print classification report
print(metrics.classification_report(y_test, grid_predictions))
print("Accuracy:", metrics.accuracy_score(y_test, grid_predictions))
print("MCC:", metrics.matthews_corrcoef(y_test, grid_predictions))

                        precision    recall  f1-score   support

A#3 Scale (6.5 Kattai)       1.00      1.00      1.00        25
   A3 Scale (6 Kattai)       0.93      1.00      0.97        14
   B2 Scale (7 Kattai)       0.95      1.00      0.97        18
C#3 Scale (1.5 Kattai)       1.00      1.00      1.00        71
   C3 Scale (1 Kattai)       0.95      0.96      0.95        54
D#3 Scale (2.5 Kattai)       0.98      1.00      0.99        56
   D3 Scale (2 Kattai)       1.00      0.96      0.98        77
   E3 Scale (3 Kattai)       0.97      1.00      0.99        34
F#3 Scale (4.5 Kattai)       1.00      0.90      0.95        20
   F3 Scale (4 Kattai)       1.00      0.97      0.98        30
G#3 Scale (5.5 Kattai)       0.96      1.00      0.98        24
   G3 Scale (5 Kattai)       1.00      1.00      1.00        30

              accuracy                           0.98       453
             macro avg       0.98      0.98      0.98       453
          weighted avg       0.98     

# **GradientBoostingClassifier**

documentation: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html

Unbalanced

In [None]:
# defining parameter range
param_grid = {
    "loss":["deviance"],
    "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    "min_samples_split": np.linspace(0.1, 0.5, 12),
    "min_samples_leaf": np.linspace(0.1, 0.5, 12),
    "max_depth":[3,5,8],
    "max_features":["log2","sqrt"],
    "criterion": ["friedman_mse",  "mae"],
    "subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    "n_estimators":[10]
    }


In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Create Gradient Boosting classifer object
clf = GradientBoostingClassifier(random_state=42, loss='log_loss', learning_rate=0.2,
                                 n_estimators=100, subsample=1.0, criterion='friedman_mse',
                                 min_samples_split=2, min_samples_leaf=0.2,
                                 max_depth=5, max_features='sqrt')

# Train classifer
clf = clf.fit(X_train,y_train)

In [None]:
#Predict the response for test dataset
y_pred = clf.predict(X_test)

In [None]:
# print classification report
print(metrics.classification_report(y_test, y_pred))
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("MCC:", metrics.matthews_corrcoef(y_test, y_pred))

                        precision    recall  f1-score   support

A#3 Scale (6.5 Kattai)       1.00      1.00      1.00        25
   A3 Scale (6 Kattai)       0.93      1.00      0.97        14
   B2 Scale (7 Kattai)       1.00      0.94      0.97        18
C#3 Scale (1.5 Kattai)       1.00      1.00      1.00        71
   C3 Scale (1 Kattai)       1.00      1.00      1.00        54
D#3 Scale (2.5 Kattai)       1.00      1.00      1.00        56
   D3 Scale (2 Kattai)       1.00      1.00      1.00        77
   E3 Scale (3 Kattai)       0.97      1.00      0.99        34
F#3 Scale (4.5 Kattai)       1.00      1.00      1.00        20
   F3 Scale (4 Kattai)       1.00      0.97      0.98        30
G#3 Scale (5.5 Kattai)       1.00      1.00      1.00        24
   G3 Scale (5 Kattai)       1.00      1.00      1.00        30

              accuracy                           1.00       453
             macro avg       0.99      0.99      0.99       453
          weighted avg       1.00     

Balanced

In [None]:
# Create Gradient Boosting classifer object
clf = GradientBoostingClassifier(random_state=42, loss='log_loss', learning_rate=0.2,
                                 n_estimators=100, subsample=1.0, criterion='friedman_mse',
                                 min_samples_split=0.1, min_samples_leaf=0.3,
                                 max_depth=8, max_features='sqrt')

# Train classifer
clf = clf.fit(X_res,y_res)

In [None]:
#Predict the response for test dataset
y_pred = clf.predict(X_test)

# print classification report
print(metrics.classification_report(y_test, y_pred))
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("MCC:", metrics.matthews_corrcoef(y_test, y_pred))

                        precision    recall  f1-score   support

A#3 Scale (6.5 Kattai)       1.00      1.00      1.00        25
   A3 Scale (6 Kattai)       1.00      1.00      1.00        14
   B2 Scale (7 Kattai)       1.00      0.94      0.97        18
C#3 Scale (1.5 Kattai)       0.99      1.00      0.99        71
   C3 Scale (1 Kattai)       1.00      0.96      0.98        54
D#3 Scale (2.5 Kattai)       1.00      1.00      1.00        56
   D3 Scale (2 Kattai)       1.00      0.96      0.98        77
   E3 Scale (3 Kattai)       0.94      1.00      0.97        34
F#3 Scale (4.5 Kattai)       1.00      1.00      1.00        20
   F3 Scale (4 Kattai)       0.91      0.97      0.94        30
G#3 Scale (5.5 Kattai)       0.96      1.00      0.98        24
   G3 Scale (5 Kattai)       1.00      1.00      1.00        30

              accuracy                           0.98       453
             macro avg       0.98      0.99      0.98       453
          weighted avg       0.99     

# **AdaBoost**

Unbalanced

In [None]:
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(n_estimators=250, learning_rate=0.2, algorithm='SAMME.R',
                         random_state=3)

clf.fit(X_train, y_train)

In [None]:
#Predict the response for test dataset
y_pred = clf.predict(X_test)

# print classification report
print(metrics.classification_report(y_test, y_pred))
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("MCC:", metrics.matthews_corrcoef(y_test, y_pred))

                        precision    recall  f1-score   support

A#3 Scale (6.5 Kattai)       1.00      1.00      1.00        25
   A3 Scale (6 Kattai)       1.00      1.00      1.00        14
   B2 Scale (7 Kattai)       1.00      0.89      0.94        18
C#3 Scale (1.5 Kattai)       1.00      1.00      1.00        71
   C3 Scale (1 Kattai)       0.93      1.00      0.96        54
D#3 Scale (2.5 Kattai)       1.00      1.00      1.00        56
   D3 Scale (2 Kattai)       1.00      0.97      0.99        77
   E3 Scale (3 Kattai)       0.97      1.00      0.99        34
F#3 Scale (4.5 Kattai)       0.83      1.00      0.91        20
   F3 Scale (4 Kattai)       0.96      0.90      0.93        30
G#3 Scale (5.5 Kattai)       0.95      0.83      0.89        24
   G3 Scale (5 Kattai)       1.00      1.00      1.00        30

              accuracy                           0.98       453
             macro avg       0.97      0.97      0.97       453
          weighted avg       0.98     

Balanced

In [None]:
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier(n_estimators=200, learning_rate=0.2, algorithm='SAMME.R',
                         random_state=3)

clf.fit(X_res, y_res)

In [None]:
#Predict the response for test dataset
y_pred = clf.predict(X_test)

# print classification report
print(metrics.classification_report(y_test, y_pred))
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("MCC:", metrics.matthews_corrcoef(y_test, y_pred))

                        precision    recall  f1-score   support

A#3 Scale (6.5 Kattai)       0.96      1.00      0.98        25
   A3 Scale (6 Kattai)       1.00      0.93      0.96        14
   B2 Scale (7 Kattai)       1.00      0.89      0.94        18
C#3 Scale (1.5 Kattai)       1.00      1.00      1.00        71
   C3 Scale (1 Kattai)       1.00      0.61      0.76        54
D#3 Scale (2.5 Kattai)       1.00      1.00      1.00        56
   D3 Scale (2 Kattai)       1.00      0.94      0.97        77
   E3 Scale (3 Kattai)       0.92      1.00      0.96        34
F#3 Scale (4.5 Kattai)       0.91      1.00      0.95        20
   F3 Scale (4 Kattai)       0.55      0.97      0.70        30
G#3 Scale (5.5 Kattai)       1.00      1.00      1.00        24
   G3 Scale (5 Kattai)       1.00      1.00      1.00        30

              accuracy                           0.93       453
             macro avg       0.94      0.94      0.93       453
          weighted avg       0.96     

# Export best model
100 components of Librosa MFCCs (using UMAP)

RandomForestClassifier

Unbalanced dataset

'criterion': 'entropy', 'max_depth': 6, 'max_features': 'sqrt', 'n_estimators': 200

In [None]:
# Import model
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(criterion='entropy', max_depth=6, max_features='sqrt', n_estimators=200, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
# print classification report
print(metrics.classification_report(y_test, y_pred))
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("MCC:", metrics.matthews_corrcoef(y_test, y_pred))

                        precision    recall  f1-score   support

A#3 Scale (6.5 Kattai)       1.00      1.00      1.00        25
   A3 Scale (6 Kattai)       1.00      1.00      1.00        14
   B2 Scale (7 Kattai)       1.00      0.94      0.97        18
C#3 Scale (1.5 Kattai)       0.99      1.00      0.99        71
   C3 Scale (1 Kattai)       1.00      1.00      1.00        54
D#3 Scale (2.5 Kattai)       1.00      1.00      1.00        56
   D3 Scale (2 Kattai)       1.00      1.00      1.00        77
   E3 Scale (3 Kattai)       0.97      1.00      0.99        34
F#3 Scale (4.5 Kattai)       1.00      1.00      1.00        20
   F3 Scale (4 Kattai)       1.00      0.97      0.98        30
G#3 Scale (5.5 Kattai)       1.00      1.00      1.00        24
   G3 Scale (5 Kattai)       1.00      1.00      1.00        30

              accuracy                           1.00       453
             macro avg       1.00      0.99      0.99       453
          weighted avg       1.00     

In [None]:
# save model
joblib.dump(rf, "/content/drive/MyDrive/best_random_forest_100UMAP_LibrosaMFCC_unbalanced.joblib")

['/content/drive/MyDrive/best_random_forest_100UMAP_LibrosaMFCC_unbalanced.joblib']

# Export best model
100 components of Librosa MFCCs (using UMAP)

Support Vector Machine (SVM)

Unbalanced dataset

'C': 1.0, 'decision_function_shape': 'ovr', 'gamma': 'auto', 'kernel': 'poly'

In [None]:
# Import model
from sklearn import svm
svm_model = svm.SVC(C=0.5, gamma='auto', kernel='poly', random_state=42)
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)
# print classification report
print(metrics.classification_report(y_test, y_pred))
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("MCC:", metrics.matthews_corrcoef(y_test, y_pred))

                        precision    recall  f1-score   support

A#3 Scale (6.5 Kattai)       1.00      1.00      1.00        25
   A3 Scale (6 Kattai)       1.00      1.00      1.00        14
   B2 Scale (7 Kattai)       1.00      0.83      0.91        18
C#3 Scale (1.5 Kattai)       0.99      1.00      0.99        71
   C3 Scale (1 Kattai)       1.00      1.00      1.00        54
D#3 Scale (2.5 Kattai)       1.00      1.00      1.00        56
   D3 Scale (2 Kattai)       1.00      1.00      1.00        77
   E3 Scale (3 Kattai)       0.97      1.00      0.99        34
F#3 Scale (4.5 Kattai)       0.91      1.00      0.95        20
   F3 Scale (4 Kattai)       1.00      0.97      0.98        30
G#3 Scale (5.5 Kattai)       1.00      1.00      1.00        24
   G3 Scale (5 Kattai)       1.00      1.00      1.00        30

              accuracy                           0.99       453
             macro avg       0.99      0.98      0.99       453
          weighted avg       0.99     

In [None]:
# save model
joblib.dump(rf, "/content/drive/MyDrive/best_SVM_100UMAP_LibrosaMFCC_unbalanced.joblib")

['/content/drive/MyDrive/best_SVM_100UMAP_LibrosaMFCC_unbalanced.joblib']

# Export best model
100 components of Librosa MFCCs (using UMAP)

Gradient Boosting Classifier

Unbalanced dataset

loss='log_loss', learning_rate=0.2,
                                 n_estimators=100, subsample=1.0, criterion='friedman_mse',
                                 min_samples_split=2, min_samples_leaf=0.2,
                                 max_depth=5, max_features='sqrt'

In [None]:
# Import model
from sklearn.ensemble import GradientBoostingClassifier
# Create Gradient Boosting classifer object
clf = GradientBoostingClassifier(random_state=42, loss='log_loss', learning_rate=0.2,
                                 n_estimators=100, subsample=1.0, criterion='friedman_mse',
                                 min_samples_split=2, min_samples_leaf=0.2,
                                 max_depth=5, max_features='sqrt')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# print classification report
print(metrics.classification_report(y_test, y_pred))
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("MCC:", metrics.matthews_corrcoef(y_test, y_pred))

                        precision    recall  f1-score   support

A#3 Scale (6.5 Kattai)       1.00      1.00      1.00        25
   A3 Scale (6 Kattai)       0.93      1.00      0.97        14
   B2 Scale (7 Kattai)       1.00      0.94      0.97        18
C#3 Scale (1.5 Kattai)       1.00      1.00      1.00        71
   C3 Scale (1 Kattai)       1.00      1.00      1.00        54
D#3 Scale (2.5 Kattai)       1.00      1.00      1.00        56
   D3 Scale (2 Kattai)       1.00      1.00      1.00        77
   E3 Scale (3 Kattai)       0.97      1.00      0.99        34
F#3 Scale (4.5 Kattai)       1.00      1.00      1.00        20
   F3 Scale (4 Kattai)       1.00      0.97      0.98        30
G#3 Scale (5.5 Kattai)       1.00      1.00      1.00        24
   G3 Scale (5 Kattai)       1.00      1.00      1.00        30

              accuracy                           1.00       453
             macro avg       0.99      0.99      0.99       453
          weighted avg       1.00     

In [None]:
# save model
joblib.dump(rf, "/content/drive/MyDrive/best_GradientBoostingClf_100UMAP_LibrosaMFCC_unbalanced.joblib")

['/content/drive/MyDrive/best_GradientBoostingClf_100UMAP_LibrosaMFCC_unbalanced.joblib']