In [None]:
import os
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from IPython.display import clear_output
from collections import defaultdict

In [None]:
base_folder = './'
processed_dataset = './DatasetPro'
index_file = 'sample_index.csv'
chord_distribution_file = os.path.join(processed_dataset, 'chord_distribution.csv')
split_index_file = os.path.join(processed_dataset, 'split_index.csv')
train_output_file = os.path.join(processed_dataset, 'train_features.csv')
val_output_file = os.path.join(processed_dataset, 'val_features.csv')
test_output_file = os.path.join(processed_dataset, 'test_features.csv')

In [None]:
# Load the chord distribution file
chord_distribution_df = pd.read_csv(chord_distribution_file)

# Calculate total chords by summing the percentages
total_chords = chord_distribution_df['Percentage'].sum()

# Find chords that appear less than 2% of the total chords
rare_chords = chord_distribution_df[chord_distribution_df["Percentage"] < (2 / total_chords) * 100]  # Less than 2%

# Display rare chords
print("🔍 Rare Chords (less than 2% occurrences):")
print(rare_chords)


🔍 Rare Chords (less than 2% occurrences):
     Chord  Percentage  Chord_Count
17   G:min    1.809031        74946
18   C:min    1.490460        61748
19  C#:min    1.404530        58188
20  F#:min    1.403467        58144
21   F:min    1.295885        53687
22     E:7    1.259630        52185
23  Eb:min    1.254296        51964
24     G:7    1.183838        49045
25     A:7    1.147245        47529
26     D:7    1.138459        47165
27     C:7    1.114007        46152
28     F:7    1.047097        43380
29    Ab:7    0.982867        40719
30    Eb:7    0.921677        38184
31  Bb:min    0.777937        32229
32     B:7    0.754282        31249
33  Ab:min    0.667651        27660
34    Bb:7    0.558621        23143
35    C#:7    0.527942        21872
36    F#:7    0.510080        21132
37     NaN    0.011007          456


In [None]:
# Convert all values to strings and then sort
all_chords = sorted(chord_distribution_df['Chord'].dropna().astype(str).unique())
print(all_chords)

['A:7', 'A:maj', 'A:min', 'Ab:7', 'Ab:maj', 'Ab:min', 'B:7', 'B:maj', 'B:min', 'Bb:7', 'Bb:maj', 'Bb:min', 'C#:7', 'C#:maj', 'C#:min', 'C:7', 'C:maj', 'C:min', 'D:7', 'D:maj', 'D:min', 'E:7', 'E:maj', 'E:min', 'Eb:7', 'Eb:maj', 'Eb:min', 'F#:7', 'F#:maj', 'F#:min', 'F:7', 'F:maj', 'F:min', 'G:7', 'G:maj', 'G:min', 'N']


In [None]:
# Load the chord distribution file
chord_distribution_df = pd.read_csv(chord_distribution_file)

# Load sample index file (contains all sample IDs)
index_df = pd.read_csv(index_file)

# Define split ratios
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

# Threshold for rare chords
rare_threshold = (1 / chord_distribution_df['Percentage'].sum()) * 100

# Identify rare chords
rare_chords = chord_distribution_df[chord_distribution_df["Percentage"] < rare_threshold]['Chord'].tolist()

print(f"🔍 Identified {len(rare_chords)} rare chords :")
print(rare_chords)

# Step 1: Map Chords to Songs
chord_to_songs = defaultdict(set)
song_to_chords = {}

for sample_id in index_df['sample_id'].astype(str).str.zfill(4):  # Ensure correct formatting
    sample_folder = os.path.join(processed_dataset, sample_id)
    merged_file = os.path.join(sample_folder, 'merged_chroma_lab.csv')

    if os.path.exists(merged_file):
        df = pd.read_csv(merged_file)
        unique_chords = set(df['Final_Chord'].dropna().unique())

        # Store song → chords mapping
        song_to_chords[sample_id] = unique_chords

        # Store chord → songs mapping
        for chord in unique_chords:
            chord_to_songs[chord].add(sample_id)

print(f"🎵 Total songs processed: {len(song_to_chords)}")
print(f"🎸 Sample song with its chords: {list(song_to_chords.items())[0]}")


🔍 Identified 9 rare chords :
['Ab:7', 'Eb:7', 'Bb:min', 'B:7', 'Ab:min', 'Bb:7', 'C#:7', 'F#:7', nan]


KeyboardInterrupt: 

In [None]:
# Step 2: Balanced Song Selection
train_songs = set()
val_songs = set()
test_songs = set()

# Start with rare chord songs (so they are included in all splits)
rare_songs = {song for chord in rare_chords for song in chord_to_songs[chord]}

print(f"🎯 {len(rare_songs)} songs contain rare chords.")

# Assign rare chord songs first
for song in rare_songs:
    if len(train_songs) / len(index_df) < train_ratio:
        train_songs.add(song)
    elif len(val_songs) / len(index_df) < val_ratio:
        val_songs.add(song)
    else:
        test_songs.add(song)

print(f"✅ After rare chord assignment - Train: {len(train_songs)}, Val: {len(val_songs)}, Test: {len(test_songs)}")

# Assign remaining songs while balancing chord distributions
remaining_songs = list(set(song_to_chords.keys()) - rare_songs)
np.random.shuffle(remaining_songs)

for song in remaining_songs:
    if len(train_songs) / len(index_df) < train_ratio:
        train_songs.add(song)
    elif len(val_songs) / len(index_df) < val_ratio:
        val_songs.add(song)
    else:
        test_songs.add(song)

print(f"🎯 Final Split Counts - Train: {len(train_songs)}, Val: {len(val_songs)}, Test: {len(test_songs)}")


🎯 361 songs contain rare chords.
✅ After rare chord assignment - Train: 361, Val: 0, Test: 0
🎯 Final Split Counts - Train: 623, Val: 134, Test: 133


In [38]:
# Step 3: Save Index File with Splits
index_df['split'] = index_df['sample_id'].astype(str).str.zfill(4).apply(
    lambda x: 'train' if x in train_songs else ('val' if x in val_songs else 'test')
)

split_index_file = os.path.join(processed_dataset, 'split_index.csv')
index_df.to_csv(split_index_file, index=False)

print("🚀 Song-based stratified dataset split **completed** successfully!")
print(f"📁 Split details saved in {split_index_file}")

🚀 Song-based stratified dataset split **completed** successfully!
📁 Split details saved in ./DatasetPro/split_index.csv


In [60]:
import numpy as np
import os
import pandas as pd

# Initialize lists to store the features, labels, and sequence indices
X_train, y_train, sequence_indices_train = [], [], []
X_val, y_val, sequence_indices_val = [], [], []
X_test, y_test, sequence_indices_test = [], [], []

# Initialize sequence ID trackers
sequence_id_train = 0
sequence_id_val = 0
sequence_id_test = 0

# Iterate through the split_index and process the files accordingly
for idx, row in split_index_df.iterrows():
    sample_id = str(row['sample_id']).zfill(4)
    split = row['split']  # 'train', 'val', or 'test'
    
    if split not in ['train', 'val', 'test']:
        continue  # Skip invalid splits
    
    print(f"🔍 Processing sample {sample_id} for split: {split}...")

    sample_folder = os.path.join(processed_dataset, sample_id)
    merged_file = os.path.join(sample_folder, 'merged_chroma_lab.csv')

    if os.path.exists(merged_file):
        df = pd.read_csv(merged_file)
        
        # Extract the normal chroma (columns 14 to 25)
        normal_chroma = df.iloc[:, 13:25].values  # Shape: (timesteps, 12)
        
        # Labels (Final_Chord)
        labels = df['Final_Chord'].values
        
        # Filter out NaN labels
        valid_idx = ~pd.isna(labels)
        normal_chroma = normal_chroma[valid_idx]
        labels = labels[valid_idx]
        
        # Map labels to numeric values
        chord_mapping = {chord: idx for idx, chord in enumerate(all_chords)}
        labels_numeric = np.array([chord_mapping[chord] for chord in labels])
        
        # Store the sequence and track index boundaries based on split
        if split == 'train':
            X_train.append(normal_chroma)
            y_train.append(labels_numeric)
            sequence_indices_train.extend([sequence_id_train] * len(normal_chroma))  # Extend with sequence ID
            sequence_id_train += 1  # Increment sequence ID for next sequence
        elif split == 'val':
            X_val.append(normal_chroma)
            y_val.append(labels_numeric)
            sequence_indices_val.extend([sequence_id_val] * len(normal_chroma))  # Extend with sequence ID
            sequence_id_val += 1  # Increment sequence ID for next sequence
        elif split == 'test':
            X_test.append(normal_chroma)
            y_test.append(labels_numeric)
            sequence_indices_test.extend([sequence_id_test] * len(normal_chroma))  # Extend with sequence ID
            sequence_id_test += 1  # Increment sequence ID for next sequence

    print(f"✅ Finished processing sample {sample_id}.")

# Flatten the feature arrays and labels
X_train = np.concatenate(X_train, axis=0)
y_train = np.concatenate(y_train, axis=0)

X_val = np.concatenate(X_val, axis=0)
y_val = np.concatenate(y_val, axis=0)

X_test = np.concatenate(X_test, axis=0)
y_test = np.concatenate(y_test, axis=0)

# Convert sequence indices to numpy arrays (sequence indices should now match the lengths of X and y)
sequence_indices_train = np.array(sequence_indices_train)
sequence_indices_val = np.array(sequence_indices_val)
sequence_indices_test = np.array(sequence_indices_test)

# Save the data with proper headers
chroma_headers = df.columns[13:25].tolist()  # Chroma feature names
headers = ','.join(chroma_headers + ['label', 'sequence_id'])  # Updated header

# Stack the arrays and save them to CSV
np.savetxt(train_output_file, np.column_stack((X_train, y_train, sequence_indices_train)), delimiter=',', header=headers, comments='')
np.savetxt(val_output_file, np.column_stack((X_val, y_val, sequence_indices_val)), delimiter=',', header=headers, comments='')
np.savetxt(test_output_file, np.column_stack((X_test, y_test, sequence_indices_test)), delimiter=',', header=headers, comments='')

print(f"🎯 Dataset processing complete!")
print(f"📁 Train data saved in: {train_output_file}")
print(f"📁 Validation data saved in: {val_output_file}")
print(f"📁 Test data saved in: {test_output_file}")


🔍 Processing sample 0003 for split: train...
✅ Finished processing sample 0003.
🔍 Processing sample 0004 for split: train...
✅ Finished processing sample 0004.
🔍 Processing sample 0006 for split: test...
✅ Finished processing sample 0006.
🔍 Processing sample 0010 for split: train...
✅ Finished processing sample 0010.
🔍 Processing sample 0012 for split: val...
✅ Finished processing sample 0012.
🔍 Processing sample 0015 for split: train...
✅ Finished processing sample 0015.
🔍 Processing sample 0016 for split: train...
✅ Finished processing sample 0016.
🔍 Processing sample 0018 for split: train...
✅ Finished processing sample 0018.
🔍 Processing sample 0019 for split: train...
✅ Finished processing sample 0019.
🔍 Processing sample 0021 for split: train...
✅ Finished processing sample 0021.
🔍 Processing sample 0022 for split: train...
✅ Finished processing sample 0022.
🔍 Processing sample 0023 for split: train...
✅ Finished processing sample 0023.
🔍 Processing sample 0025 for split: test...