In [42]:
!pip install numpy
!pip install soundfile
!pip install jams
!pip install librosa


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting librosa
  Using cached librosa-0.11.0-py3-none-any.whl.metadata (8.7 kB)
Collecting audioread>=2.1.9 (from librosa)
  Using cached audioread-3.1.0-py3-none-any.whl.metadata (9.0 kB)
Collecting numba>=0.51.0 (from librosa)
  Using cached

In [43]:
import random
from pathlib import Path
import numpy as np
import soundfile as sf
import jams
import librosa

In [36]:
# Project layout, resolved relative to the directory the notebook runs from.
cdw = Path.cwd()
root = cdw.parents[0]  # one level above the working directory
audio_dir = root.joinpath("dataset", "audio")
jams_dir = root.joinpath("dataset", "annotation")
output_dir = root.joinpath("processed_data")
output_dir.mkdir(parents=True, exist_ok=True)

# Collect every .wav file in a deterministic (sorted) order ...
wav_files = sorted(audio_dir.glob("*.wav"))
if len(wav_files) == 0:
    print("No wav files found. Need to create audio directory with wave files")

# ... then shuffle with a fixed seed so the order is the same on every run.
random.seed(42)
random.shuffle(wav_files)

In [67]:
# One (string name, open-string MIDI note number) pair per guitar string,
# listed from the lowest-pitched string to the highest.
STRING_INFO = [
    ("E2", 40),
    ("A2", 45),
    ("D3", 50),
    ("G3", 55),
    ("B3", 59),
    ("E4", 64),
]

# Spectrogram parameters
SR = 44100        # sample rate passed to librosa
N_FFT = 2048      # length of the FFT window
HOP_LENGTH = 512  # samples between successive spectrogram frames

# Chunking parameters (both measured in spectrogram frames)
WINDOW_SIZE = 128  # frames per training example
STRIDE = 128       # step between the starts of consecutive examples

In [68]:
# 80/20 train/test partition of the file list; the split is made per file,
# so every chunk cut from one recording ends up on the same side.
split_idx = int(0.8 * len(wav_files))
train_files, test_files = wav_files[:split_idx], wav_files[split_idx:]

print(f"Total Songs: {len(wav_files)}")
print(f"Train: {len(train_files)} | Test: {len(test_files)}")

Total Songs: 357
Train: 285 | Test: 72


In [69]:


def extract_guitar_tabs_from_jam(jam):
    """Turn the ``note_midi`` annotations of a JAMS object into fret events.

    The annotations returned by ``jam.search(namespace="note_midi")`` are paired
    positionally with ``STRING_INFO`` (first annotation -> first string, and so
    on); if the two differ in length the extra entries are ignored by ``zip``.

    Args:
        jam: object exposing ``search(namespace=...)`` whose results iterate
            over observations with ``time``, ``duration`` and ``value``.

    Returns:
        A new list of dicts with keys ``"start"``, ``"end"``,
        ``"string_index"`` (1-based) and ``"fret"`` (int), ordered by start time.
    """
    per_string_annotations = jam.search(namespace="note_midi")
    paired_with_tuning = zip(per_string_annotations, STRING_INFO)

    note_events = []
    for string_number, (annotation, (_name, open_pitch)) in enumerate(paired_with_tuning, start=1):
        for observation in annotation:
            # Fret = semitone distance between the rounded pitch and the open string.
            fret_number = round(float(observation.value)) - open_pitch

            # A pitch below the open string cannot be fretted; treat it as an
            # annotation error and drop it.
            if fret_number >= 0:
                note_events.append({
                    "start": observation.time,
                    "end": observation.time + observation.duration,
                    "string_index": string_number,
                    "fret": int(fret_number),
                })

    # Stable sort by onset: events on different strings may share a start time.
    return sorted(note_events, key=lambda event: event["start"])

def create_label_matrix(events, total_frames, sr, hop_length):
    """Rasterise note events into a per-string, per-frame fret matrix.

    Args:
        events: iterable of dicts with ``"string_index"`` (1-based), ``"fret"``,
            ``"start"`` and ``"end"`` (times are converted with
            ``librosa.time_to_frames``).
        total_frames: number of time frames (columns) in the output.
        sr: sample rate used for the time -> frame conversion.
        hop_length: hop length used for the time -> frame conversion.

    Returns:
        Integer array of shape ``(6, total_frames)``. Cells not covered by any
        event hold 21, the "silence / no note" class. Note that an event whose
        fret is itself 21 would therefore be indistinguishable from silence.
    """
    silence_class = 21
    label_grid = np.full((6, total_frames), silence_class, dtype=int)

    for note in events:
        row = note["string_index"] - 1  # events are 1-based, array rows 0-based
        fret_value = note["fret"]

        # Seconds -> frame indices, clamped to the valid column range.
        first_col = max(0, librosa.time_to_frames(note["start"], sr=sr, hop_length=hop_length))
        last_col = min(total_frames, librosa.time_to_frames(note["end"], sr=sr, hop_length=hop_length))

        # Events that collapse to zero frames (or lie outside the range) are skipped.
        if first_col >= last_col:
            continue
        label_grid[row, first_col:last_col] = fret_value

    return label_grid

def process_track(audio_path, jams_path):
    """Load one recording together with the note events from its JAMS file.

    Args:
        audio_path: path to the audio file, read with ``soundfile``.
        jams_path: path to the matching ``.jams`` annotation file.

    Returns:
        Tuple ``(audio, sr, events)``: the sample array, the sample rate
        reported by ``sf.read``, and the event list produced by
        ``extract_guitar_tabs_from_jam``.
    """
    samples, sample_rate = sf.read(str(audio_path))

    # Multi-channel data with more rows than columns is taken to be
    # (samples, channels) and flipped to (channels, samples), the layout the
    # librosa calls downstream are given.
    is_multichannel = samples.ndim > 1
    if is_multichannel and samples.shape[0] > samples.shape[1]:
        samples = samples.T

    # The labels come from the annotation file, not from the audio itself.
    annotation = jams.load(str(jams_path))
    note_events = extract_guitar_tabs_from_jam(annotation)

    return samples, sample_rate, note_events

def slice_data(features, labels, window_size, stride):
    """Cut a track's features and labels into aligned fixed-length windows.

    Args:
        features: array indexed as ``[string, frequency, frame]``; the number
            of frames is read from ``features.shape[2]``.
        labels: array indexed as ``[string, frame]``, sliced on the same frame
            ranges as ``features``.
        window_size: number of frames per window.
        stride: step, in frames, between the starts of consecutive windows.

    Returns:
        ``(X_chunks, y_chunks)``: two equally long lists holding, for each
        window, ``features[:, :, t:t + window_size]`` and
        ``labels[:, t:t + window_size]``. Only full-length windows are
        produced; both lists are empty when the track is shorter than
        ``window_size``.
    """
    X_chunks = []
    y_chunks = []

    total_frames = features.shape[2]

    # The last valid window starts at total_frames - window_size. range() has
    # an exclusive stop, so the +1 is needed to include that final window
    # (without it a track of exactly window_size frames yields nothing).
    last_start_exclusive = total_frames - window_size + 1

    for start in range(0, last_start_exclusive, stride):
        stop = start + window_size
        X_chunks.append(features[:, :, start:stop])
        y_chunks.append(labels[:, start:stop])

    return X_chunks, y_chunks

In [70]:
# Creating loop to process datasets
def build_dataset(file_list):
    """Unfinished draft of the dataset builder.

    It only resolves the annotation path for each audio file and checks that
    it exists; nothing is ever appended to the result lists, so the function
    always returns two empty arrays. A later cell in this notebook defines
    ``build_dataset`` again, and that later definition is the one in effect
    once both cells have run.

    Args:
        file_list: iterable of ``pathlib.Path`` objects pointing at audio files.

    Returns:
        ``(np.array([]), np.array([]))``.
    """
    X, y = [], []

    for wav_file in file_list:
        # Audio files carry a "_hex_cln" suffix that the annotation files lack.
        base_name = wav_file.stem.split("_hex_cln")[0]
        matching_jams_path = jams_dir / (base_name + ".jams")
        if not matching_jams_path.exists():
            continue  # no annotation available for this recording
        # Feature/label extraction was never added to this draft.

    return np.array(X), np.array(y)

In [79]:


def build_dataset(file_list):
    """Convert a list of audio files into stacked spectrogram/label windows.

    For every file the matching JAMS annotation is located, the audio is turned
    into a dB-scaled mel spectrogram, the note events are rasterised into a
    per-frame label matrix, and both are cut into fixed-size windows.

    Args:
        file_list: sequence of ``pathlib.Path`` objects pointing at audio files
            whose stem contains the ``"_hex_cln"`` suffix.

    Returns:
        ``(X, y)`` as numpy arrays built from all windows of all processed
        files. Both are empty arrays when nothing was processed.
    """
    X_master, y_master = [], []

    for i, wav_file in enumerate(file_list):
        # Audio files carry a "_hex_cln" suffix that the annotation files lack.
        jams_name = wav_file.stem.split("_hex_cln")[0] + ".jams"
        matching_jams_path = jams_dir / jams_name
        if not matching_jams_path.exists():
            # Without an annotation there are no labels; skip instead of
            # letting jams.load fail and abort the whole build.
            print(f"  Skipping {wav_file.name}: no annotation at {matching_jams_path}")
            continue

        audio, sr, events = process_track(wav_file, matching_jams_path)

        # Raw audio -> mel spectrogram. The sample rate reported by the file is
        # used (not the global SR) so that the frame timing of the features and
        # of the labels below stays correct for any input rate.
        spect = librosa.feature.melspectrogram(
            y=audio, sr=sr, n_fft=N_FFT, hop_length=HOP_LENGTH
        )
        # ref=np.max puts 0 dB at the loudest value; everything else is negative.
        spect = librosa.power_to_db(spect, ref=np.max)

        # Labels must have exactly one column per spectrogram frame.
        num_time_frames = spect.shape[2]
        labels = create_label_matrix(events, num_time_frames, sr, HOP_LENGTH)

        # Break the track into fixed-size training examples.
        X_c, y_c = slice_data(spect, labels, WINDOW_SIZE, STRIDE)
        X_master.extend(X_c)
        y_master.extend(y_c)

        if i % 10 == 0:
            print(f"  Processed {i}/{len(file_list)}...")

    return np.array(X_master), np.array(y_master)

In [80]:
# Run the pipeline on both splits
print("\n--- Building Training Set ---")
X_train, y_train = build_dataset(train_files)

print("\n--- Building Testing Set ---")
X_test, y_test = build_dataset(test_files)

# Report the resulting shapes
print("\n--- DONE ---")
print(f"X_train shape: {X_train.shape}")  # Expect (N_samples, 6, 128, 128)
print(f"y_train shape: {y_train.shape}")  # Expect (N_samples, 6, 128)

# Persist all four arrays in a single .npz archive, keyed by variable name
save_path = output_dir / "guitar_hex_data.npz"
np.savez(
    save_path,
    **{"X_train": X_train, "y_train": y_train, "X_test": X_test, "y_test": y_test},
)
print(f"Saved dataset to {save_path}")


--- Building Training Set ---
  Processed 0/285...
  Processed 10/285...
  Processed 20/285...
  Processed 30/285...
  Processed 40/285...
  Processed 50/285...
  Processed 60/285...
  Processed 70/285...
  Processed 80/285...
  Processed 90/285...
  Processed 100/285...
  Processed 110/285...
  Processed 120/285...
  Processed 130/285...
  Processed 140/285...
  Processed 150/285...
  Processed 160/285...
  Processed 170/285...
  Processed 180/285...
  Processed 190/285...
  Processed 200/285...
  Processed 210/285...
  Processed 220/285...
  Processed 230/285...
  Processed 240/285...
  Processed 250/285...
  Processed 260/285...
  Processed 270/285...
  Processed 280/285...

--- Building Testing Set ---
  Processed 0/72...
  Processed 10/72...
  Processed 20/72...
  Processed 30/72...
  Processed 40/72...
  Processed 50/72...
  Processed 60/72...
  Processed 70/72...

--- DONE ---
X_train shape: (5601, 6, 128, 128)
y_train shape: (5601, 6, 128)
Saved dataset to /Users/bm343/Documen