In [1]:
import tensorflow as tf
import numpy as np

print("TensorFlow version:", tf.__version__)
print("NumPy version:", np.__version__)
print("GPU Available:", tf.config.list_physical_devices('GPU'))

2025-05-01 13:02:43.450721: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-01 13:02:43.773507: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


TensorFlow version: 2.12.1
NumPy version: 1.23.5
GPU Available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [10]:
# import os
# os.environ["TF_XLA_FLAGS"] = "--tf_xla_auto_jit=0"


import os
import librosa
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import joblib
import os

In [3]:
# loading Yamnet and setting directories

In [4]:
import os

# Set TFHub cache directory to a local folder you own
os.environ['TFHUB_CACHE_DIR'] = './.tfhub_cache'

In [5]:
AUDIO_DIR = "../musicData/fma_small"
CSV_PATH = "../data/metadata.csv"
OUT_DIR = "../features/yamnet_embeddings"
YAMNET_MODEL = 'https://tfhub.dev/google/yamnet/1'

os.makedirs(OUT_DIR, exist_ok=True)

yamnet = hub.load(YAMNET_MODEL)
print("YAMNet model loaded")

2025-05-01 13:03:14.465527: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 11569 MB memory:  -> device: 0, name: NVIDIA A100 80GB PCIe, pci bus id: 0000:03:00.0, compute capability: 8.0
2025-05-01 13:03:14.866884: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs_3' with dtype int32 and shape [?]
	 [[{{node inputs_3}}]]
2025-05-01 13:03:14.876569: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs_1' with dtype int32 and shape [3]
	 [[{{node inputs_1}}]]
2025-05-01 13:03:14.876699: I tensorflow/core/common_runtime/executor.cc:1197] [/de

YAMNet model loaded


In [7]:
waveform, sr = librosa.load("../musicData/fma_small/000/000002.mp3", sr=16000)
scores, embeddings, spectrogram = yamnet(waveform)

print("Embedding shape:", embeddings.shape)
print("Sample mean vector:", tf.reduce_mean(embeddings, axis=0).numpy()[:10])

2025-05-01 13:06:44.378770: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'waveform' with dtype float and shape [?]
	 [[{{node waveform}}]]
2025-05-01 13:06:45.240556: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:637] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2025-05-01 13:06:45.378550: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8907


Embedding shape: (62, 1024)
Sample mean vector: [7.1898277e-04 2.6383866e-03 6.1445320e-01 1.8966133e-02 2.5788273e-03
 0.0000000e+00 1.5048351e-04 1.3661080e-02 1.3728974e-03 5.5781973e-04]


In [9]:
# Extract embeddings
def extract_embedding(audio_path):
    waveform, sr = librosa.load(audio_path, sr=16000)
    _, embeddings, _ = yamnet(waveform)
    return tf.reduce_mean(embeddings, axis=0).numpy()

df = pd.read_csv(CSV_PATH)
X = []
y = []

for i, row in df.iterrows():
    track_id = str(row['track_id']).zfill(6)
    genre = row['genre']
    folder = track_id[:3]
    filepath = os.path.join(AUDIO_DIR, folder, f"{track_id}.mp3")

    if not os.path.exists(filepath):
        continue

    try:
        emb = extract_embedding(filepath)
        X.append(emb)
        y.append(genre)
        print(f"{i+1}: {track_id} → {genre}")
    except Exception as e:
        print(f"Error with {filepath}: {e}")

os.makedirs("../features/yamnet_embeddings", exist_ok=True)
np.save("../features/yamnet_embeddings/X_embeddings.npy", np.array(X))
np.save("../features/yamnet_embeddings/y_labels.npy", np.array(y))
print("Saved X_embeddings.npy and y_labels.npy!")

1: 000002 → Hip-Hop
2: 000005 → Hip-Hop
3: 000010 → Pop
4: 000140 → Folk
5: 000141 → Folk
6: 000148 → Experimental
7: 000182 → Rock
8: 000190 → Folk
9: 000193 → Folk
10: 000194 → Folk
11: 000197 → Folk
12: 000200 → Folk
13: 000203 → Folk
14: 000204 → Folk
15: 000207 → Folk
16: 000210 → Folk
17: 000211 → Folk
18: 000212 → Folk
19: 000213 → Pop
20: 000255 → Rock
21: 000256 → Rock
22: 000368 → Rock
23: 000424 → Experimental
24: 000459 → Rock
25: 000534 → Folk
26: 000540 → Folk
27: 000546 → Folk
28: 000574 → Rock
29: 000602 → Folk
30: 000615 → Experimental
31: 000620 → Folk
32: 000621 → Folk
33: 000625 → Folk
34: 000666 → International
35: 000667 → International
36: 000676 → Hip-Hop
37: 000690 → Rock
38: 000694 → Hip-Hop
39: 000695 → Hip-Hop
40: 000704 → International
41: 000705 → International
42: 000706 → International
43: 000707 → International
44: 000708 → International
45: 000709 → International
46: 000714 → Folk
47: 000715 → Folk
48: 000716 → Folk
49: 000718 → Folk
50: 000777 → Rock


[src/libmpg123/layer3.c:INT123_do_layer3():1948] error: dequantization failed!


491: 011298 → Hip-Hop
492: 011299 → Hip-Hop
493: 011306 → Rock
494: 011333 → Experimental
495: 011334 → Experimental
496: 011503 → Folk
497: 011504 → Folk
498: 011505 → Folk
499: 011508 → Folk
500: 011544 → Experimental
501: 011638 → Instrumental
502: 011671 → International
503: 011672 → International
504: 011673 → International
505: 011674 → International
506: 011675 → International
507: 011677 → International
508: 011679 → International
509: 011681 → International
510: 011682 → International
511: 011683 → International
512: 011763 → Hip-Hop
513: 011764 → Hip-Hop
514: 011765 → Hip-Hop
515: 011766 → Hip-Hop
516: 011767 → Hip-Hop
517: 011768 → Hip-Hop
518: 011769 → Hip-Hop
519: 011770 → Hip-Hop
520: 011771 → Hip-Hop
521: 011772 → Hip-Hop
522: 011773 → Hip-Hop
523: 011774 → Hip-Hop
524: 011775 → Hip-Hop
525: 011776 → Hip-Hop
526: 011777 → Hip-Hop
527: 011778 → Hip-Hop
528: 011779 → Hip-Hop
529: 011780 → Hip-Hop
530: 011781 → Hip-Hop
531: 011782 → Hip-Hop
532: 011783 → Hip-Hop
533: 011784

[src/libmpg123/layer3.c:INT123_do_layer3():1908] error: dequantization failed!


903: 021672 → Pop
904: 021676 → Pop
905: 021677 → Pop
906: 021707 → Folk
907: 021774 → Experimental
908: 021842 → Electronic
909: 021859 → Rock
910: 021860 → Rock
911: 021891 → Electronic
912: 021895 → Electronic
913: 021995 → Pop
914: 021996 → Pop
915: 021997 → Pop
916: 021998 → Pop
917: 021999 → Pop
918: 022000 → Pop
919: 022001 → Pop
920: 022088 → Hip-Hop
921: 022091 → Hip-Hop
922: 022093 → Rock
923: 022094 → Hip-Hop
924: 022095 → Electronic
925: 022097 → Pop
926: 022150 → Experimental
927: 022295 → Hip-Hop
928: 022296 → Hip-Hop
929: 022315 → International
930: 022348 → Pop
931: 022472 → Pop
932: 022473 → Pop
933: 022474 → Pop
934: 022475 → Pop
935: 022476 → Pop
936: 022477 → Pop
937: 022478 → Pop
938: 022479 → Pop
939: 022480 → Pop
940: 022481 → Pop
941: 023010 → Electronic
942: 023013 → Electronic
943: 023014 → Folk
944: 023015 → Folk
945: 023016 → Electronic
946: 023037 → Rock
947: 023039 → Pop
948: 023041 → Rock
949: 023063 → Experimental
950: 023155 → International
951: 023156 

[src/libmpg123/layer3.c:INT123_do_layer3():1908] error: dequantization failed!


1183: 029255 → Pop
1184: 029271 → Pop
1185: 029272 → Experimental
1186: 029350 → International
1187: 029351 → International
1188: 029355 → International
1189: 029465 → Experimental
1190: 029480 → Folk
1191: 029526 → Electronic
1192: 029528 → Electronic
1193: 029530 → Electronic
1194: 029587 → Hip-Hop
1195: 029602 → Hip-Hop
1196: 029673 → Electronic
1197: 029718 → Hip-Hop
1198: 029719 → Hip-Hop
1199: 029720 → Hip-Hop
1200: 029721 → Hip-Hop
1201: 029738 → Experimental
1202: 029739 → Experimental
1203: 029740 → Experimental
1204: 029741 → Experimental
1205: 029742 → Experimental
1206: 029744 → Experimental
1207: 029745 → Experimental
1208: 029746 → Experimental
1209: 029747 → Experimental
1210: 029750 → Experimental
1211: 029752 → Experimental
1212: 029807 → Rock
1213: 029813 → Rock
1214: 029816 → Electronic
1215: 029961 → Hip-Hop
1216: 029971 → Folk
1217: 030041 → Hip-Hop
1218: 030043 → Hip-Hop
1219: 030050 → Hip-Hop
1220: 030056 → Hip-Hop
1221: 030058 → Hip-Hop
1222: 030059 → Hip-Hop
12

[src/libmpg123/layer3.c:INT123_do_layer3():1878] error: part2_3_length (3360) too large for available bit count (3240)


2267: 054570 → International
2268: 054576 → International
2269: 054578 → International


[src/libmpg123/layer3.c:INT123_do_layer3():1878] error: part2_3_length (3328) too large for available bit count (3240)


2270: 054580 → International
2271: 054621 → Rock
2272: 054623 → Rock
2273: 054624 → Rock
2274: 054625 → Rock
2275: 054626 → Rock
2276: 054662 → Electronic
2277: 054664 → Pop
2278: 054665 → Pop
2279: 054666 → Pop
2280: 054667 → Pop
2281: 054703 → Folk
2282: 054719 → Electronic
2283: 054735 → Rock
2284: 054753 → Hip-Hop
2285: 054874 → Electronic
2286: 054942 → Rock
2287: 055076 → Pop
2288: 055097 → Hip-Hop
2289: 055100 → Hip-Hop
2290: 055101 → Hip-Hop
2291: 055102 → Hip-Hop
2292: 055113 → Hip-Hop
2293: 055119 → Hip-Hop
2294: 055120 → Hip-Hop
2295: 055121 → Hip-Hop
2296: 055122 → Hip-Hop
2297: 055123 → Hip-Hop
2298: 055124 → Hip-Hop
2299: 055149 → Electronic
2300: 055183 → Electronic
2301: 055186 → Electronic
2302: 055231 → International
2303: 055232 → International
2304: 055233 → International
2305: 055234 → International
2306: 055235 → International
2307: 055236 → International
2308: 055237 → International
2309: 055238 → International
2310: 055240 → International
2311: 055241 → Internat

Note: Illegal Audio-MPEG-Header 0x00000000 at offset 33361.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1389] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).
  waveform, sr = librosa.load(audio_path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


4424: 098565 → Hip-Hop


Note: Illegal Audio-MPEG-Header 0x00000000 at offset 22401.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1389] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).
  waveform, sr = librosa.load(audio_path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


4425: 098567 → Hip-Hop


[src/libmpg123/layer3.c:INT123_do_layer3():1908] error: dequantization failed!
Note: Illegal Audio-MPEG-Header 0x00000000 at offset 63168.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1389] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).
  waveform, sr = librosa.load(audio_path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


4426: 098569 → Hip-Hop
4427: 098573 → Hip-Hop
4428: 098574 → Hip-Hop
4429: 098575 → Hip-Hop
4430: 098576 → Hip-Hop
4431: 098577 → Hip-Hop
4432: 098578 → Hip-Hop
4433: 098579 → Hip-Hop
4434: 098580 → Hip-Hop
4435: 098581 → Hip-Hop
4436: 098582 → Hip-Hop
4437: 098583 → Hip-Hop
4438: 098584 → Hip-Hop
4439: 098585 → Hip-Hop
4440: 098613 → Hip-Hop
4441: 098617 → Hip-Hop
4442: 098618 → Hip-Hop
4443: 098619 → Hip-Hop
4444: 098620 → Hip-Hop
4445: 098621 → Hip-Hop
4446: 098622 → Hip-Hop
4447: 098623 → Hip-Hop
4448: 098624 → Hip-Hop
4449: 098625 → Hip-Hop
4450: 098626 → Hip-Hop
4451: 098627 → Hip-Hop
4452: 098628 → Hip-Hop
4453: 098655 → International
4454: 098656 → International
4455: 098657 → International
4456: 098666 → International
4457: 098667 → International
4458: 098668 → International
4459: 098669 → International
4460: 098670 → International
4461: 098671 → International
4462: 098680 → International
4463: 098681 → International
4464: 098701 → Rock
4465: 098770 → Electronic
4466: 098838 →

  waveform, sr = librosa.load(audio_path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error with ../musicData/fma_small/099/099134.mp3: 
4472: 099135 → Electronic
4473: 099214 → Experimental
4474: 099260 → Rock
4475: 099261 → Electronic
4476: 099274 → Experimental
4477: 099311 → Electronic
4478: 099313 → Electronic
4479: 099345 → Rock
4480: 099361 → International
4481: 099362 → International
4482: 099363 → International
4483: 099364 → International
4484: 099368 → International
4485: 099369 → International
4486: 099370 → International
4487: 099371 → International
4488: 099372 → International
4489: 099373 → International
4490: 099374 → International
4491: 099375 → International
4492: 099389 → International
4493: 099390 → International
4494: 099391 → International
4495: 099392 → International
4496: 099393 → International
4497: 099394 → International
4498: 099395 → International
4499: 099411 → Rock
4500: 099419 → Rock
4501: 099436 → Rock
4502: 099437 → Rock
4503: 099438 → Rock
4504: 099439 → Rock
4505: 099440 → Rock
4506: 099441 → Rock
4507: 099442 → Rock
4508: 099501 → Hip

  waveform, sr = librosa.load(audio_path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


4905: 108957 → Rock
4906: 108961 → Rock
4907: 108962 → Rock
4908: 108967 → Rock
4909: 108968 → Rock
4910: 108969 → Rock
4911: 108970 → Rock
4912: 108992 → Rock
4913: 109068 → Experimental
4914: 109071 → Experimental
4915: 109072 → Experimental
4916: 109106 → Experimental
4917: 109144 → Experimental
4918: 109189 → Instrumental
4919: 109191 → Instrumental
4920: 109203 → Rock
4921: 109235 → Rock
4922: 109276 → Experimental
4923: 109349 → Rock
4924: 109350 → Rock
4925: 109355 → Rock
4926: 109356 → Rock
4927: 109357 → Rock
4928: 109445 → Folk
4929: 109446 → Folk
4930: 109447 → Folk
4931: 109448 → Folk
4932: 109449 → Folk
4933: 109450 → Folk
4934: 109468 → Rock
4935: 109480 → Folk
4936: 109481 → Rock
4937: 109497 → Rock
4938: 109535 → Electronic
4939: 109537 → Electronic
4940: 109538 → Electronic
4941: 109542 → Electronic
4942: 109543 → Electronic
4943: 109548 → Experimental
4944: 109670 → Hip-Hop
4945: 109681 → Hip-Hop
4946: 109684 → Hip-Hop
4947: 109685 → Hip-Hop
4948: 109686 → Hip-Hop
494

  waveform, sr = librosa.load(audio_path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error with ../musicData/fma_small/133/133297.mp3: 
6967: 133332 → Experimental
6968: 133333 → Experimental
6969: 133431 → Instrumental
6970: 133432 → Instrumental
6971: 133433 → Instrumental
6972: 133434 → Instrumental
6973: 133435 → Instrumental
6974: 133436 → Instrumental
6975: 133437 → Instrumental
6976: 133438 → Instrumental
6977: 133439 → Instrumental
6978: 133440 → Instrumental
6979: 133441 → Instrumental
6980: 133442 → Instrumental
6981: 133443 → Instrumental
6982: 133444 → Instrumental
6983: 133445 → Instrumental
6984: 133446 → Instrumental
6985: 133447 → Instrumental
6986: 133448 → Instrumental
6987: 133449 → Instrumental
6988: 133450 → Instrumental
6989: 133451 → Instrumental
6990: 133452 → Instrumental
6991: 133453 → Instrumental
6992: 133454 → Instrumental
6993: 133455 → Instrumental
6994: 133456 → Instrumental
6995: 133457 → Instrumental
6996: 133459 → Instrumental
6997: 133479 → Rock
6998: 133535 → Instrumental
6999: 133537 → Instrumental
7000: 133538 → Instrumental
7001:

In [14]:
#Training using RandomForestClassifier

X = np.load("../features/yamnet_embeddings/X_embeddings.npy")
y = np.load("../features/yamnet_embeddings/y_labels.npy")

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Train model
clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)

# Predict
y_pred = clf.predict(X_test)

os.makedirs("../models", exist_ok=True)
os.makedirs("../reports/Yamnet/", exist_ok=True)

# Save model
joblib.dump(clf, "../models/yamnet_genre_classifier.pkl")

# classification report
report = classification_report(y_test, y_pred)
with open("../reports/Yamnet/classification_report.txt", "w") as f:
    f.write(report)

# Save confusion matrix plot
plt.figure(figsize=(10, 8))
disp = ConfusionMatrixDisplay.from_predictions(y_test, y_pred, xticks_rotation=45)
plt.title("🎵 YAMNet Genre Classification — Confusion Matrix")
plt.tight_layout()
plt.savefig("../reports/Yamnet/confusion_matrix.png")
plt.close()

print("✅ Model, classification report, and confusion matrix saved.")

  plt.tight_layout()
  plt.savefig("../reports/Yamnet/confusion_matrix.png")


✅ Model, classification report, and confusion matrix saved.


<Figure size 1000x800 with 0 Axes>