In [None]:
# Install the third-party packages listed below; pip's stdout is redirected to
# /dev/null to keep the cell output short.
# NOTE(review): `gdown` is imported further down in this cell but is not in this
# install list -- presumably it is preinstalled in the Colab runtime; confirm.
!pip install kaggle pretty_midi torch torchvision torchaudio scikit-learn matplotlib tqdm > /dev/null

# Mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

import os, re, json, glob, zipfile, random, warnings
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# -------------------------
# 1) Path configuration & ZIP download
# -------------------------
FILE_ID = "1VPx3nKsEPR-GV9v0QiNZRbA_Ptdt7lsA"   # Google Drive file ID of the combined dataset ZIP
ZIP_PATH = Path("/content/dataset_mirex.zip")
EXTRACT_DIR = Path("/content/dataset_mirex")    # extraction root

# Folder on the mounted Drive; created up front.
SAVE_DIR = Path("/content/drive/MyDrive/dataset_mirex")
SAVE_DIR.mkdir(parents=True, exist_ok=True)

import gdown

# A leftover file from an interrupted or failed download would satisfy a bare
# exists() check and then crash inside ZipFile; discard it so it is fetched again.
if ZIP_PATH.exists() and not zipfile.is_zipfile(ZIP_PATH):
    print("Existing ZIP is not a valid archive, re-downloading:", ZIP_PATH)
    ZIP_PATH.unlink()

if not ZIP_PATH.exists():
    print("Downloading ZIP via gdown...")
    gdown.download(id=FILE_ID, output=str(ZIP_PATH), quiet=False)
else:
    print("ZIP already exists:", ZIP_PATH)

# Fail with an explicit message instead of an opaque ZipFile error when the
# download did not produce a usable archive (is_zipfile is False for a missing file too).
if not zipfile.is_zipfile(ZIP_PATH):
    raise RuntimeError(f"Download failed or produced an invalid ZIP: {ZIP_PATH}")

print("Extracting ZIP ...")
EXTRACT_DIR.mkdir(parents=True, exist_ok=True)
with zipfile.ZipFile(ZIP_PATH, 'r') as zf:
    zf.extractall(EXTRACT_DIR)

print("Done extract.")
print("EXTRACT_DIR content:")
for p in EXTRACT_DIR.iterdir():
    print(" -", p)


Mounted at /content/drive
Downloading ZIP via gdown...


Downloading...
From (original): https://drive.google.com/uc?id=1VPx3nKsEPR-GV9v0QiNZRbA_Ptdt7lsA
From (redirected): https://drive.google.com/uc?id=1VPx3nKsEPR-GV9v0QiNZRbA_Ptdt7lsA&confirm=t&uuid=8bf0816d-510b-4c49-8e0d-91c74e42ede0
To: /content/dataset_mirex.zip
100%|██████████| 320M/320M [00:05<00:00, 55.4MB/s]


Extracting ZIP ...
Done extract.
EXTRACT_DIR content:
 - /content/dataset_mirex/dataset
 - /content/dataset_mirex/README.txt


In [None]:
from pathlib import Path
import os

EXTRACT_DIR = Path("/content/dataset_mirex")

# Collect every directory under EXTRACT_DIR that directly contains the
# Audio, Lyrics and MIDIs sub-folders.
candidates = [
    Path(parent)
    for parent, subdirs, _ in os.walk(EXTRACT_DIR)
    if {"Audio", "Lyrics", "MIDIs"} <= set(subdirs)
]

if len(candidates) == 0:
    raise RuntimeError("Gak nemu folder yang berisi Audio, Lyrics, MIDIs di bawah /content/dataset_mirex")

# The first match in walk order is used as the dataset root.
ROOT = candidates[0]
AUDIO_DIR, LYRIC_DIR, MIDI_DIR = (ROOT / sub for sub in ("Audio", "Lyrics", "MIDIs"))

for caption, location in (
    ("ROOT terdeteksi :", ROOT),
    ("AUDIO_DIR       :", AUDIO_DIR),
    ("LYRIC_DIR       :", LYRIC_DIR),
    ("MIDI_DIR        :", MIDI_DIR),
):
    print(caption, location)

# Find the .bat files anywhere below ROOT (sorted by path).
BAT_FILES = sorted(ROOT.rglob("split-by-categories*.bat"))
print("\nBAT_FILES ditemukan:", len(BAT_FILES))
for bat_file in BAT_FILES:
    print(" -", bat_file)


ROOT terdeteksi : /content/dataset_mirex/dataset
AUDIO_DIR       : /content/dataset_mirex/dataset/Audio
LYRIC_DIR       : /content/dataset_mirex/dataset/Lyrics
MIDI_DIR        : /content/dataset_mirex/dataset/MIDIs

BAT_FILES ditemukan: 3
 - /content/dataset_mirex/dataset/split-by-categories-audio.bat
 - /content/dataset_mirex/dataset/split-by-categories-lyrics.bat
 - /content/dataset_mirex/dataset/split-by-categories-midi.bat


In [None]:
import re
import pandas as pd

def parse_bat_labels(bat_path: Path) -> pd.DataFrame:
    r"""Parse the ``move`` commands of a split-by-categories .bat file into a label table.

    A line is recognised when it contains (case-insensitively) something like::

        move 001.mp3 "Cluster 1\Boisterous\"
        move 002.txt "Cluster 2\Aggressive\"

    i.e. a numeric file name with extension mp3/txt/mid followed by a quoted
    ``<cluster>\<subcategory>`` destination. Lines that do not match are ignored.

    Parameters
    ----------
    bat_path : Path
        Path of the .bat file. It is read as UTF-8; undecodable bytes are ignored.

    Returns
    -------
    pd.DataFrame
        One row per matched line with columns ``file_id`` (number zero-padded to
        at least 3 digits), ``ext`` (lower-cased), ``cluster``, ``subcat``,
        ``label_cluster`` (same value as ``cluster``) and ``label_full``
        (``"<cluster> / <subcat>"``). The columns are present even when no line
        matches, so callers can always select them.
    """
    columns = ["file_id", "ext", "cluster", "subcat", "label_cluster", "label_full"]
    pat = re.compile(
        r'move\s+(\d+)\.(mp3|txt|mid)\s+"([^\\"]+)\\([^\\"]+)',
        re.IGNORECASE
    )
    rows = []
    with open(bat_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            m = pat.search(line.strip())
            if m is None:
                continue
            file_id_raw, ext, cluster, subcat = m.groups()
            cluster = cluster.strip()
            subcat = subcat.strip()
            rows.append({
                "file_id": f"{int(file_id_raw):03d}",  # 1 -> 001
                "ext": ext.lower(),
                "cluster": cluster,   # e.g. "Cluster 1"
                "subcat": subcat,     # e.g. "Boisterous"
                "label_cluster": cluster,
                "label_full": f"{cluster} / {subcat}",
            })
    # Passing `columns` keeps the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

# Parse every .bat file, then concatenate once: growing a DataFrame with
# pd.concat inside the loop re-copies all previous rows on each pass and
# concatenates onto an empty frame on the first one.
label_columns = ["file_id","ext","cluster","subcat","label_cluster","label_full"]
parsed_frames = []

for bat in BAT_FILES:
    df_bat = parse_bat_labels(bat)
    print(f"{bat.name} -> {len(df_bat)} baris hasil parse")
    if not df_bat.empty:
        parsed_frames.append(df_bat)

if parsed_frames:
    labels_all_df = pd.concat(parsed_frames, ignore_index=True)
else:
    # Nothing parsed (pd.concat rejects an empty list): keep an empty table with the schema.
    labels_all_df = pd.DataFrame(columns=label_columns)

print("\nTotal labels_all_df:", labels_all_df.shape)
labels_all_df.head()


split-by-categories-audio.bat -> 903 baris hasil parse
split-by-categories-lyrics.bat -> 903 baris hasil parse
split-by-categories-midi.bat -> 903 baris hasil parse

Total labels_all_df: (2709, 6)


Unnamed: 0,file_id,ext,cluster,subcat,label_cluster,label_full
0,1,mp3,Cluster 1,Boisterous,Cluster 1,Cluster 1 / Boisterous
1,2,mp3,Cluster 1,Boisterous,Cluster 1,Cluster 1 / Boisterous
2,3,mp3,Cluster 1,Boisterous,Cluster 1,Cluster 1 / Boisterous
3,4,mp3,Cluster 1,Boisterous,Cluster 1,Cluster 1 / Boisterous
4,5,mp3,Cluster 1,Boisterous,Cluster 1,Cluster 1 / Boisterous


In [None]:
# Remove exact repeats first (same file_id, same extension, same full label).
labels_all_df = (
    labels_all_df
    .drop_duplicates(subset=["file_id", "ext", "label_full"])
    .reset_index(drop=True)
)

# Priority used to pick one label per track: mp3 > txt > mid (lower value wins);
# any other extension gets 99.
prio_map = {"mp3": 0, "txt": 1, "mid": 2}
ext_rank = labels_all_df["ext"].map(prio_map).fillna(99)
labels_all_df["ext_prio"] = ext_rank

labels_all_df_sorted = labels_all_df.sort_values(by=["file_id", "ext_prio"])

# Keep only the first (highest-priority) row of each file_id.
is_first_for_id = ~labels_all_df_sorted.duplicated(subset=["file_id"], keep="first")
main_labels = (
    labels_all_df_sorted
    .loc[is_first_for_id, ["file_id", "label_cluster", "label_full"]]
    .reset_index(drop=True)
)

print("unique file IDs dengan label:", len(main_labels))
main_labels.head()


unique file IDs dengan label: 903


Unnamed: 0,file_id,label_cluster,label_full
0,1,Cluster 1,Cluster 1 / Boisterous
1,2,Cluster 1,Cluster 1 / Boisterous
2,3,Cluster 1,Cluster 1 / Boisterous
3,4,Cluster 1,Cluster 1 / Boisterous
4,5,Cluster 1,Cluster 1 / Boisterous


In [None]:
def _stems_with_suffix(folder, suffix):
    """Return the extension-less names of entries in `folder` whose lower-cased name ends with `suffix`."""
    return {
        os.path.splitext(entry)[0]
        for entry in os.listdir(folder)
        if entry.lower().endswith(suffix)
    }


# Distinct emotion labels, sorted alphabetically.
emotion_classes = sorted(main_labels["label_cluster"].unique())
print("Emotion classes:", emotion_classes)

# Two-way lookup between a label and its position in the sorted list.
emotion_to_idx = dict(zip(emotion_classes, range(len(emotion_classes))))  # first class -> 0
idx_to_emotion = dict(enumerate(emotion_classes))

master = main_labels.copy()
master = master.assign(
    emotion=master["label_cluster"],
    label_idx=lambda frame: frame["emotion"].map(emotion_to_idx),   # 0-based index
    label_id=lambda frame: frame["label_idx"] + 1,                  # 1-based id
    file_id_norm=master["file_id"],
)

# IDs of the files that actually exist in the Audio / Lyrics / MIDIs folders.
audio_ids = _stems_with_suffix(AUDIO_DIR, ".mp3")
lyrics_ids = _stems_with_suffix(LYRIC_DIR, ".txt")
midi_ids = _stems_with_suffix(MIDI_DIR, ".mid")

print("Jumlah file audio :", len(audio_ids))
print("Jumlah file lyrics:", len(lyrics_ids))
print("Jumlah file midi  :", len(midi_ids))

# 0/1 flag per modality: is there a file for this track's file_id?
for flag_name, present_ids in (
    ("has_audio", audio_ids),
    ("has_lyrics", lyrics_ids),
    ("has_midi", midi_ids),
):
    master[flag_name] = master["file_id"].isin(present_ids).astype(int)

master_tracks = master.loc[:, [
    "file_id", "file_id_norm", "emotion",
    "label_id", "label_idx",
    "has_audio", "has_lyrics", "has_midi",
]].copy()

print("master_tracks shape:", master_tracks.shape)
master_tracks.head()


Emotion classes: ['Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4', 'Cluster 5']
Jumlah file audio : 903
Jumlah file lyrics: 764
Jumlah file midi  : 196
master_tracks shape: (903, 8)


Unnamed: 0,file_id,file_id_norm,emotion,label_id,label_idx,has_audio,has_lyrics,has_midi
0,1,1,Cluster 1,1,0,1,1,0
1,2,2,Cluster 1,1,0,1,0,0
2,3,3,Cluster 1,1,0,1,1,0
3,4,4,Cluster 1,1,0,1,1,1
4,5,5,Cluster 1,1,0,1,0,0


In [None]:
DATASET_ROOT = EXTRACT_DIR  # the extraction root defined earlier

# Write the master table to CSV without the index column.
out_master = DATASET_ROOT.joinpath("master_tracks.csv")
master_tracks.to_csv(out_master, index=False)
print("✅ master_tracks.csv disimpan di:", out_master)

# Number of tracks per label_id, ordered by label_id.
label_counts = master_tracks["label_id"].value_counts().sort_index()
print("\nDistribusi label_id (1..5) di seluruh lagu:", label_counts, sep="\n")

# Column sums of the 0/1 modality flags.
modality_totals = master_tracks[["has_audio", "has_lyrics", "has_midi"]].sum()
print("\nJumlah lagu yang punya tiap modalitas:", modality_totals, sep="\n")


✅ master_tracks.csv disimpan di: /content/dataset_mirex/master_tracks.csv

Distribusi label_id (1..5) di seluruh lagu:
label_id
1    170
2    164
3    215
4    191
5    163
Name: count, dtype: int64

Jumlah lagu yang punya tiap modalitas:
has_audio     903
has_lyrics    764
has_midi      196
dtype: int64


In [None]:
from sklearn.model_selection import train_test_split

# (optional) keep only tracks that have at least one modality flag set.
has_any_modality = (
    master_tracks[["has_audio", "has_lyrics", "has_midi"]].eq(1).any(axis=1)
)
df = master_tracks.loc[has_any_modality].reset_index(drop=True)

# First cut: hold out 20% of the tracks (for val + test), stratified by label_id.
train_df, temp_df = train_test_split(
    df, test_size=0.20, stratify=df["label_id"], random_state=42
)

# Second cut: halve the held-out part into val and test, again stratified.
val_df, test_df = train_test_split(
    temp_df, test_size=0.50, stratify=temp_df["label_id"], random_state=42
)

print("Total:", len(df))
print("Train:", len(train_df), "Val:", len(val_df), "Test:", len(test_df))

print("\nDistribusi label_id per split:")
for heading, part in (("Train:\n", train_df), ("Val:\n", val_df), ("Test:\n", test_df)):
    print(heading, part["label_id"].value_counts().sort_index())


Total: 903
Train: 722 Val: 90 Test: 91

Distribusi label_id per split:
Train:
 label_id
1    136
2    131
3    172
4    153
5    130
Name: count, dtype: int64
Val:
 label_id
1    17
2    16
3    21
4    19
5    17
Name: count, dtype: int64
Test:
 label_id
1    17
2    17
3    22
4    19
5    16
Name: count, dtype: int64


In [None]:
# One two-column frame per partition: the track id plus the partition name.
train_split = train_df[["file_id_norm"]].assign(split="train")
val_split = val_df[["file_id_norm"]].assign(split="val")
test_split = test_df[["file_id_norm"]].assign(split="test")

# Stack train, val and test (in that order) and expose the id column as "id".
split_global = (
    pd.concat([train_split, val_split, test_split], ignore_index=True)
    .rename(columns={"file_id_norm": "id"})
)

out_split = DATASET_ROOT.joinpath("split_global.csv")
split_global.to_csv(out_split, index=False)
print("✅ split_global.csv disimpan di:", out_split)
split_global.head()


✅ split_global.csv disimpan di: /content/dataset_mirex/split_global.csv


Unnamed: 0,id,split
0,56,train
1,303,train
2,386,train
3,469,train
4,268,train


In [None]:
import shutil
from pathlib import Path

# Lokasi di runtime Colab (lokal, yang sekarang sudah ada)
src_master = Path("/content/dataset_mirex/master_tracks.csv")
src_split  = Path("/content/dataset_mirex/split_global.csv")

# Lokasi di Google Drive, folder yang kelihatan di screenshotmu
drive_root = Path("/content/drive/MyDrive/dataset_mirex")

# Pastikan foldernya ada
drive_root.mkdir(parents=True, exist_ok=True)

# Copy file ke Drive
shutil.copy(src_master, drive_root / "master_tracks.csv")
shutil.copy(src_split,  drive_root / "split_global.csv")

print("Copied to:", drive_root)
!ls -l "/content/drive/MyDrive/dataset_mirex"


Copied to: /content/drive/MyDrive/dataset_mirex
total 1939
-rw------- 1 root root 964824 Nov 26 11:12 EDA_multimodal_20251126-111258.zip
-rw------- 1 root root 964824 Nov 26 15:20 EDA_multimodal_20251126-152053.zip
drwx------ 2 root root   4096 Nov 26 09:18 eda_outputs
-rw------- 1 root root  25362 Dec 15 05:55 master_tracks.csv
drwx------ 2 root root   4096 Dec 10 02:25 miditrain
drwx------ 2 root root   4096 Dec 15 01:55 miditrain2
drwx------ 2 root root   4096 Dec 10 06:32 miditrainsvm
drwx------ 2 root root   4096 Dec 15 05:50 miditrainX
-rw------- 1 root root   8768 Dec 15 05:55 split_global.csv
