In [1]:
import tensorflow as tf
import numpy as np

print("TensorFlow version:", tf.__version__)
print("NumPy version:", np.__version__)
print("GPU Available:", tf.config.list_physical_devices('GPU'))

2025-05-01 13:02:43.450721: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-01 13:02:43.773507: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


TensorFlow version: 2.12.1
NumPy version: 1.23.5
GPU Available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [10]:
# import os
# os.environ["TF_XLA_FLAGS"] = "--tf_xla_auto_jit=0"


import os
import librosa
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import joblib
import os

In [3]:
# loading Yamnet and setting directories

In [4]:
import os

# Set TFHub cache directory to a local folder you own
os.environ['TFHUB_CACHE_DIR'] = './.tfhub_cache'

In [5]:
AUDIO_DIR = "../musicData/fma_small"
CSV_PATH = "../data/metadata.csv"
OUT_DIR = "../features/yamnet_embeddings"
YAMNET_MODEL = 'https://tfhub.dev/google/yamnet/1'

os.makedirs(OUT_DIR, exist_ok=True)

yamnet = hub.load(YAMNET_MODEL)
print("YAMNet model loaded")

2025-05-01 13:03:14.465527: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 11569 MB memory:  -> device: 0, name: NVIDIA A100 80GB PCIe, pci bus id: 0000:03:00.0, compute capability: 8.0
2025-05-01 13:03:14.866884: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs_3' with dtype int32 and shape [?]
	 [[{{node inputs_3}}]]
2025-05-01 13:03:14.876569: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs_1' with dtype int32 and shape [3]
	 [[{{node inputs_1}}]]
2025-05-01 13:03:14.876699: I tensorflow/core/common_runtime/executor.cc:1197] [/de

YAMNet model loaded


In [7]:
waveform, sr = librosa.load("../musicData/fma_small/000/000002.mp3", sr=16000)
scores, embeddings, spectrogram = yamnet(waveform)

print("Embedding shape:", embeddings.shape)
print("Sample mean vector:", tf.reduce_mean(embeddings, axis=0).numpy()[:10])

2025-05-01 13:06:44.378770: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'waveform' with dtype float and shape [?]
	 [[{{node waveform}}]]
2025-05-01 13:06:45.240556: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:637] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2025-05-01 13:06:45.378550: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8907


Embedding shape: (62, 1024)
Sample mean vector: [7.1898277e-04 2.6383866e-03 6.1445320e-01 1.8966133e-02 2.5788273e-03
 0.0000000e+00 1.5048351e-04 1.3661080e-02 1.3728974e-03 5.5781973e-04]


In [9]:
# Extract embeddings
def extract_embedding(audio_path):
    waveform, sr = librosa.load(audio_path, sr=16000)
    _, embeddings, _ = yamnet(waveform)
    return tf.reduce_mean(embeddings, axis=0).numpy()

df = pd.read_csv(CSV_PATH)
X = []
y = []

for i, row in df.iterrows():
    track_id = str(row['track_id']).zfill(6)
    genre = row['genre']
    folder = track_id[:3]
    filepath = os.path.join(AUDIO_DIR, folder, f"{track_id}.mp3")

    if not os.path.exists(filepath):
        continue

    try:
        emb = extract_embedding(filepath)
        X.append(emb)
        y.append(genre)
        print(f"{i+1}: {track_id} â†’ {genre}")
    except Exception as e:
        print(f"Error with {filepath}: {e}")

os.makedirs("../features/yamnet_embeddings", exist_ok=True)
np.save("../features/yamnet_embeddings/X_embeddings.npy", np.array(X))
np.save("../features/yamnet_embeddings/y_labels.npy", np.array(y))
print("Saved X_embeddings.npy and y_labels.npy!")

1: 000002 â†’ Hip-Hop
2: 000005 â†’ Hip-Hop
3: 000010 â†’ Pop
4: 000140 â†’ Folk
5: 000141 â†’ Folk
6: 000148 â†’ Experimental
7: 000182 â†’ Rock
8: 000190 â†’ Folk
9: 000193 â†’ Folk
10: 000194 â†’ Folk
11: 000197 â†’ Folk
12: 000200 â†’ Folk
13: 000203 â†’ Folk
14: 000204 â†’ Folk
15: 000207 â†’ Folk
16: 000210 â†’ Folk
17: 000211 â†’ Folk
18: 000212 â†’ Folk
19: 000213 â†’ Pop
20: 000255 â†’ Rock
21: 000256 â†’ Rock
22: 000368 â†’ Rock
23: 000424 â†’ Experimental
24: 000459 â†’ Rock
25: 000534 â†’ Folk
26: 000540 â†’ Folk
27: 000546 â†’ Folk
28: 000574 â†’ Rock
29: 000602 â†’ Folk
30: 000615 â†’ Experimental
31: 000620 â†’ Folk
32: 000621 â†’ Folk
33: 000625 â†’ Folk
34: 000666 â†’ International
35: 000667 â†’ International
36: 000676 â†’ Hip-Hop
37: 000690 â†’ Rock
38: 000694 â†’ Hip-Hop
39: 000695 â†’ Hip-Hop
40: 000704 â†’ International
41: 000705 â†’ International
42: 000706 â†’ International
43: 000707 â†’ International
44: 000708 â†’ International
45: 000709 â†’ International


[src/libmpg123/layer3.c:INT123_do_layer3():1948] error: dequantization failed!


491: 011298 â†’ Hip-Hop
492: 011299 â†’ Hip-Hop
493: 011306 â†’ Rock
494: 011333 â†’ Experimental
495: 011334 â†’ Experimental
496: 011503 â†’ Folk
497: 011504 â†’ Folk
498: 011505 â†’ Folk
499: 011508 â†’ Folk
500: 011544 â†’ Experimental
501: 011638 â†’ Instrumental
502: 011671 â†’ International
503: 011672 â†’ International
504: 011673 â†’ International
505: 011674 â†’ International
506: 011675 â†’ International
507: 011677 â†’ International
508: 011679 â†’ International
509: 011681 â†’ International
510: 011682 â†’ International
511: 011683 â†’ International
512: 011763 â†’ Hip-Hop
513: 011764 â†’ Hip-Hop
514: 011765 â†’ Hip-Hop
515: 011766 â†’ Hip-Hop
516: 011767 â†’ Hip-Hop
517: 011768 â†’ Hip-Hop
518: 011769 â†’ Hip-Hop
519: 011770 â†’ Hip-Hop
520: 011771 â†’ Hip-Hop
521: 011772 â†’ Hip-Hop
522: 011773 â†’ Hip-Hop
523: 011774 â†’ Hip-Hop
524: 011775 â†’ Hip-Hop
525: 011776 â†’ Hip-Hop
526: 011777 â†’ Hip-Hop
527: 011778 â†’ Hip-Hop
528: 011779 â†’ Hip-Hop
529: 011780 â†’ Hip-Hop

[src/libmpg123/layer3.c:INT123_do_layer3():1908] error: dequantization failed!


903: 021672 â†’ Pop
904: 021676 â†’ Pop
905: 021677 â†’ Pop
906: 021707 â†’ Folk
907: 021774 â†’ Experimental
908: 021842 â†’ Electronic
909: 021859 â†’ Rock
910: 021860 â†’ Rock
911: 021891 â†’ Electronic
912: 021895 â†’ Electronic
913: 021995 â†’ Pop
914: 021996 â†’ Pop
915: 021997 â†’ Pop
916: 021998 â†’ Pop
917: 021999 â†’ Pop
918: 022000 â†’ Pop
919: 022001 â†’ Pop
920: 022088 â†’ Hip-Hop
921: 022091 â†’ Hip-Hop
922: 022093 â†’ Rock
923: 022094 â†’ Hip-Hop
924: 022095 â†’ Electronic
925: 022097 â†’ Pop
926: 022150 â†’ Experimental
927: 022295 â†’ Hip-Hop
928: 022296 â†’ Hip-Hop
929: 022315 â†’ International
930: 022348 â†’ Pop
931: 022472 â†’ Pop
932: 022473 â†’ Pop
933: 022474 â†’ Pop
934: 022475 â†’ Pop
935: 022476 â†’ Pop
936: 022477 â†’ Pop
937: 022478 â†’ Pop
938: 022479 â†’ Pop
939: 022480 â†’ Pop
940: 022481 â†’ Pop
941: 023010 â†’ Electronic
942: 023013 â†’ Electronic
943: 023014 â†’ Folk
944: 023015 â†’ Folk
945: 023016 â†’ Electronic
946: 023037 â†’ Rock
947: 023039 â†’ 

[src/libmpg123/layer3.c:INT123_do_layer3():1908] error: dequantization failed!


1183: 029255 â†’ Pop
1184: 029271 â†’ Pop
1185: 029272 â†’ Experimental
1186: 029350 â†’ International
1187: 029351 â†’ International
1188: 029355 â†’ International
1189: 029465 â†’ Experimental
1190: 029480 â†’ Folk
1191: 029526 â†’ Electronic
1192: 029528 â†’ Electronic
1193: 029530 â†’ Electronic
1194: 029587 â†’ Hip-Hop
1195: 029602 â†’ Hip-Hop
1196: 029673 â†’ Electronic
1197: 029718 â†’ Hip-Hop
1198: 029719 â†’ Hip-Hop
1199: 029720 â†’ Hip-Hop
1200: 029721 â†’ Hip-Hop
1201: 029738 â†’ Experimental
1202: 029739 â†’ Experimental
1203: 029740 â†’ Experimental
1204: 029741 â†’ Experimental
1205: 029742 â†’ Experimental
1206: 029744 â†’ Experimental
1207: 029745 â†’ Experimental
1208: 029746 â†’ Experimental
1209: 029747 â†’ Experimental
1210: 029750 â†’ Experimental
1211: 029752 â†’ Experimental
1212: 029807 â†’ Rock
1213: 029813 â†’ Rock
1214: 029816 â†’ Electronic
1215: 029961 â†’ Hip-Hop
1216: 029971 â†’ Folk
1217: 030041 â†’ Hip-Hop
1218: 030043 â†’ Hip-Hop
1219: 030050 â†’ Hip-H

[src/libmpg123/layer3.c:INT123_do_layer3():1878] error: part2_3_length (3360) too large for available bit count (3240)


2267: 054570 â†’ International
2268: 054576 â†’ International
2269: 054578 â†’ International


[src/libmpg123/layer3.c:INT123_do_layer3():1878] error: part2_3_length (3328) too large for available bit count (3240)


2270: 054580 â†’ International
2271: 054621 â†’ Rock
2272: 054623 â†’ Rock
2273: 054624 â†’ Rock
2274: 054625 â†’ Rock
2275: 054626 â†’ Rock
2276: 054662 â†’ Electronic
2277: 054664 â†’ Pop
2278: 054665 â†’ Pop
2279: 054666 â†’ Pop
2280: 054667 â†’ Pop
2281: 054703 â†’ Folk
2282: 054719 â†’ Electronic
2283: 054735 â†’ Rock
2284: 054753 â†’ Hip-Hop
2285: 054874 â†’ Electronic
2286: 054942 â†’ Rock
2287: 055076 â†’ Pop
2288: 055097 â†’ Hip-Hop
2289: 055100 â†’ Hip-Hop
2290: 055101 â†’ Hip-Hop
2291: 055102 â†’ Hip-Hop
2292: 055113 â†’ Hip-Hop
2293: 055119 â†’ Hip-Hop
2294: 055120 â†’ Hip-Hop
2295: 055121 â†’ Hip-Hop
2296: 055122 â†’ Hip-Hop
2297: 055123 â†’ Hip-Hop
2298: 055124 â†’ Hip-Hop
2299: 055149 â†’ Electronic
2300: 055183 â†’ Electronic
2301: 055186 â†’ Electronic
2302: 055231 â†’ International
2303: 055232 â†’ International
2304: 055233 â†’ International
2305: 055234 â†’ International
2306: 055235 â†’ International
2307: 055236 â†’ International
2308: 055237 â†’ International
230

Note: Illegal Audio-MPEG-Header 0x00000000 at offset 33361.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1389] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).
  waveform, sr = librosa.load(audio_path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


4424: 098565 â†’ Hip-Hop


Note: Illegal Audio-MPEG-Header 0x00000000 at offset 22401.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1389] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).
  waveform, sr = librosa.load(audio_path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


4425: 098567 â†’ Hip-Hop


[src/libmpg123/layer3.c:INT123_do_layer3():1908] error: dequantization failed!
Note: Illegal Audio-MPEG-Header 0x00000000 at offset 63168.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1389] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).
  waveform, sr = librosa.load(audio_path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


4426: 098569 â†’ Hip-Hop
4427: 098573 â†’ Hip-Hop
4428: 098574 â†’ Hip-Hop
4429: 098575 â†’ Hip-Hop
4430: 098576 â†’ Hip-Hop
4431: 098577 â†’ Hip-Hop
4432: 098578 â†’ Hip-Hop
4433: 098579 â†’ Hip-Hop
4434: 098580 â†’ Hip-Hop
4435: 098581 â†’ Hip-Hop
4436: 098582 â†’ Hip-Hop
4437: 098583 â†’ Hip-Hop
4438: 098584 â†’ Hip-Hop
4439: 098585 â†’ Hip-Hop
4440: 098613 â†’ Hip-Hop
4441: 098617 â†’ Hip-Hop
4442: 098618 â†’ Hip-Hop
4443: 098619 â†’ Hip-Hop
4444: 098620 â†’ Hip-Hop
4445: 098621 â†’ Hip-Hop
4446: 098622 â†’ Hip-Hop
4447: 098623 â†’ Hip-Hop
4448: 098624 â†’ Hip-Hop
4449: 098625 â†’ Hip-Hop
4450: 098626 â†’ Hip-Hop
4451: 098627 â†’ Hip-Hop
4452: 098628 â†’ Hip-Hop
4453: 098655 â†’ International
4454: 098656 â†’ International
4455: 098657 â†’ International
4456: 098666 â†’ International
4457: 098667 â†’ International
4458: 098668 â†’ International
4459: 098669 â†’ International
4460: 098670 â†’ International
4461: 098671 â†’ International
4462: 098680 â†’ International
4463: 098681 â†

  waveform, sr = librosa.load(audio_path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error with ../musicData/fma_small/099/099134.mp3: 
4472: 099135 â†’ Electronic
4473: 099214 â†’ Experimental
4474: 099260 â†’ Rock
4475: 099261 â†’ Electronic
4476: 099274 â†’ Experimental
4477: 099311 â†’ Electronic
4478: 099313 â†’ Electronic
4479: 099345 â†’ Rock
4480: 099361 â†’ International
4481: 099362 â†’ International
4482: 099363 â†’ International
4483: 099364 â†’ International
4484: 099368 â†’ International
4485: 099369 â†’ International
4486: 099370 â†’ International
4487: 099371 â†’ International
4488: 099372 â†’ International
4489: 099373 â†’ International
4490: 099374 â†’ International
4491: 099375 â†’ International
4492: 099389 â†’ International
4493: 099390 â†’ International
4494: 099391 â†’ International
4495: 099392 â†’ International
4496: 099393 â†’ International
4497: 099394 â†’ International
4498: 099395 â†’ International
4499: 099411 â†’ Rock
4500: 099419 â†’ Rock
4501: 099436 â†’ Rock
4502: 099437 â†’ Rock
4503: 099438 â†’ Rock
4504: 099439 â†’ Rock
4505: 099440

  waveform, sr = librosa.load(audio_path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


4905: 108957 â†’ Rock
4906: 108961 â†’ Rock
4907: 108962 â†’ Rock
4908: 108967 â†’ Rock
4909: 108968 â†’ Rock
4910: 108969 â†’ Rock
4911: 108970 â†’ Rock
4912: 108992 â†’ Rock
4913: 109068 â†’ Experimental
4914: 109071 â†’ Experimental
4915: 109072 â†’ Experimental
4916: 109106 â†’ Experimental
4917: 109144 â†’ Experimental
4918: 109189 â†’ Instrumental
4919: 109191 â†’ Instrumental
4920: 109203 â†’ Rock
4921: 109235 â†’ Rock
4922: 109276 â†’ Experimental
4923: 109349 â†’ Rock
4924: 109350 â†’ Rock
4925: 109355 â†’ Rock
4926: 109356 â†’ Rock
4927: 109357 â†’ Rock
4928: 109445 â†’ Folk
4929: 109446 â†’ Folk
4930: 109447 â†’ Folk
4931: 109448 â†’ Folk
4932: 109449 â†’ Folk
4933: 109450 â†’ Folk
4934: 109468 â†’ Rock
4935: 109480 â†’ Folk
4936: 109481 â†’ Rock
4937: 109497 â†’ Rock
4938: 109535 â†’ Electronic
4939: 109537 â†’ Electronic
4940: 109538 â†’ Electronic
4941: 109542 â†’ Electronic
4942: 109543 â†’ Electronic
4943: 109548 â†’ Experimental
4944: 109670 â†’ Hip-Hop
4945: 109681 â†

  waveform, sr = librosa.load(audio_path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error with ../musicData/fma_small/133/133297.mp3: 
6967: 133332 â†’ Experimental
6968: 133333 â†’ Experimental
6969: 133431 â†’ Instrumental
6970: 133432 â†’ Instrumental
6971: 133433 â†’ Instrumental
6972: 133434 â†’ Instrumental
6973: 133435 â†’ Instrumental
6974: 133436 â†’ Instrumental
6975: 133437 â†’ Instrumental
6976: 133438 â†’ Instrumental
6977: 133439 â†’ Instrumental
6978: 133440 â†’ Instrumental
6979: 133441 â†’ Instrumental
6980: 133442 â†’ Instrumental
6981: 133443 â†’ Instrumental
6982: 133444 â†’ Instrumental
6983: 133445 â†’ Instrumental
6984: 133446 â†’ Instrumental
6985: 133447 â†’ Instrumental
6986: 133448 â†’ Instrumental
6987: 133449 â†’ Instrumental
6988: 133450 â†’ Instrumental
6989: 133451 â†’ Instrumental
6990: 133452 â†’ Instrumental
6991: 133453 â†’ Instrumental
6992: 133454 â†’ Instrumental
6993: 133455 â†’ Instrumental
6994: 133456 â†’ Instrumental
6995: 133457 â†’ Instrumental
6996: 133459 â†’ Instrumental
6997: 133479 â†’ Rock
6998: 133535 â†’ Instrument

In [14]:
#Training using RandomForestClassifier

X = np.load("../features/yamnet_embeddings/X_embeddings.npy")
y = np.load("../features/yamnet_embeddings/y_labels.npy")

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Train model
clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)

# Predict
y_pred = clf.predict(X_test)

os.makedirs("../models", exist_ok=True)
os.makedirs("../reports/Yamnet/", exist_ok=True)

# Save model
joblib.dump(clf, "../models/yamnet_genre_classifier.pkl")

# classification report
report = classification_report(y_test, y_pred)
with open("../reports/Yamnet/classification_report.txt", "w") as f:
    f.write(report)

# Save confusion matrix plot
plt.figure(figsize=(10, 8))
disp = ConfusionMatrixDisplay.from_predictions(y_test, y_pred, xticks_rotation=45)
plt.title("ðŸŽµ YAMNet Genre Classification â€” Confusion Matrix")
plt.tight_layout()
plt.savefig("../reports/Yamnet/confusion_matrix.png")
plt.close()

print("âœ… Model, classification report, and confusion matrix saved.")

  plt.tight_layout()
  plt.savefig("../reports/Yamnet/confusion_matrix.png")


âœ… Model, classification report, and confusion matrix saved.


<Figure size 1000x800 with 0 Axes>