In [35]:
import music21


midi_file = "dataset/chopin/chopin_ballade_38_(c)galimberti.mid"
parsed_stream = music21.converter.parse(midi_file)


flat_stream = parsed_stream.flatten()
notes_to_parse = flat_stream.notesAndRests


for element in notes_to_parse:
    if element.isNote:

        pitch = element.pitch.midi  
        duration = float(element.quarterLength)  
        octave = element.pitch.octave  # Note octave
        pitch_class = element.pitch.pitchClass  # Note pitch class (0-11, where 0=C, 1=C#, etc.)
        
        # Access volume if available
        if hasattr(element, 'volume'):
            velocity = element.volume.velocity
        
    elif element.isChord:
        # Access chord properties
        chord_pitches = [n.pitch.midi for n in element.notes]
        chord_duration = float(element.quarterLength)

# You can also access other MIDI properties:
key = parsed_stream.analyze('key')  # Get key signature
time_signatures = parsed_stream.getTimeSignatures()  # Get time signatures
tempo_markings = parsed_stream.getElementsByClass('MetronomeMark')  # Get tempo markings

In [7]:
import music21

# Load MIDI file
midi_file = "dataset/chopin/chopin_ballade_38_(c)galimberti.mid"
parsed_stream = music21.converter.parse(midi_file)
flat_stream = parsed_stream.flatten()
notes_to_parse = flat_stream.notesAndRests

# 1. Pitch Class Distribution (Harmonic preferences)
pitch_class_counts = [0] * 12  # One count for each pitch class (C, C#, D, etc.)
for note in flat_stream.notes:
    if note.isNote:
        pitch_class_counts[note.pitch.pitchClass] += 1
    elif note.isChord:
        for pitch in note.pitches:
            pitch_class_counts[pitch.pitchClass] += 1

print("\n1. Pitch Class Distribution (shows harmonic preferences):")
pitch_classes = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
for pc, count in zip(pitch_classes, pitch_class_counts):
    print(f"{pc}: {count}")

# 2. Rhythm Complexity (Duration patterns)
durations = [float(n.quarterLength) for n in notes_to_parse]
unique_durations = set(durations)
print("\n2. Rhythmic Variety:")
print(f"Number of different note durations used: {len(unique_durations)}")
print(f"Duration types used: {sorted(unique_durations)}")

# 3. Chord Density (Texture complexity)
chords = flat_stream.getElementsByClass('Chord')
avg_chord_size = sum(len(c.pitches) for c in chords) / len(chords) if len(chords) > 0 else 0
print("\n3. Chord Usage (texture complexity):")
print(f"Average notes per chord: {avg_chord_size:.2f}")
print(f"Total number of chords: {len(chords)}")

# 4. Melodic Intervals (Melodic style)
melodic_intervals = []
for i in range(1, len(notes_to_parse)):
    if (notes_to_parse[i-1].isNote and notes_to_parse[i].isNote):
        interval = abs(notes_to_parse[i].pitch.midi - notes_to_parse[i-1].pitch.midi)
        melodic_intervals.append(interval)

if melodic_intervals:  # Check if list is not empty
    print("\n4. Melodic Movement:")
    print(f"Average interval size: {sum(melodic_intervals)/len(melodic_intervals):.2f} semitones")
    print(f"Largest melodic jump: {max(melodic_intervals)} semitones")
else:
    print("\n4. Melodic Movement: No intervals found")

# 5. Key Analysis (Tonality)
key = parsed_stream.analyze('key')
print("\n5. Key Information:")
print(f"Detected key: {key.tonic} {key.mode}")

# 6. Note Density (Compositional complexity)
total_duration = flat_stream.duration.quarterLength
total_notes = len(flat_stream.notes)
notes_per_quarter = total_notes / total_duration
print("\n6. Note Density:")
print(f"Notes per quarter note: {notes_per_quarter:.2f}")

# 7. Register Usage (Range preferences) - CORRECTED VERSION
all_pitches = []
for n in flat_stream.notes:
    if n.isNote:
        all_pitches.append(n.pitch.midi)
    elif n.isChord:
        all_pitches.extend([p.midi for p in n.pitches])

if all_pitches:  # Check if we found any pitches
    pitch_range = max(all_pitches) - min(all_pitches)
    print("\n7. Pitch Range:")
    print(f"Range span: {pitch_range} semitones")
    print(f"Lowest note: {min(all_pitches)} (MIDI number)")
    print(f"Highest note: {max(all_pitches)} (MIDI number)")
else:
    print("\n7. Pitch Range: No pitches found")

# 8. Time Signature Usage
time_signatures = flat_stream.getElementsByClass('TimeSignature')
print("\n8. Time Signatures Used:")
if len(time_signatures) > 0:
    for ts in time_signatures:
        print(f"{ts.numerator}/{ts.denominator}")
else:
    print("No time signatures found")

# 9. Dynamics Distribution
# 9. Dynamics Analysis (using MIDI velocity)
velocities = []
for n in flat_stream.notes:
    if n.isNote and hasattr(n, 'volume'):
        velocities.append(n.volume.velocity)
    elif n.isChord:
        # Get velocities from all notes in the chord
        for pitch in n.pitches:
            if hasattr(n, 'volume'):
                velocities.append(n.volume.velocity)

print("\n9. Dynamics Analysis (MIDI Velocity):")
if velocities:
    print(f"Average velocity: {sum(velocities)/len(velocities):.2f}")
    print(f"Velocity range: {min(velocities)} to {max(velocities)}")
    print(f"Velocity variance: {np.var(velocities):.2f}")
    
    # Categorize dynamics into ranges
    velocity_ranges = {
        'pp (very soft)': len([v for v in velocities if v < 45]),
        'p (soft)': len([v for v in velocities if 45 <= v < 60]),
        'mp (medium soft)': len([v for v in velocities if 60 <= v < 75]),
        'mf (medium loud)': len([v for v in velocities if 75 <= v < 90]),
        'f (loud)': len([v for v in velocities if 90 <= v < 105]),
        'ff (very loud)': len([v for v in velocities if v >= 105])
    }
    
    print("\nDynamic Distribution:")
    total_notes = sum(velocity_ranges.values())
    for dynamic, count in velocity_ranges.items():
        percentage = (count / total_notes) * 100 if total_notes > 0 else 0
        print(f"{dynamic}: {count} notes ({percentage:.1f}%)")
else:
    print("No velocity information found in MIDI")
    
# 10. Phrase Length Analysis
notes_rests = list(notes_to_parse)
silent_gaps = [i for i, nr in enumerate(notes_rests) if nr.isRest and nr.quarterLength >= 1.0]
if silent_gaps:
    phrase_lengths = [silent_gaps[i] - silent_gaps[i-1] if i > 0 else silent_gaps[0] 
                     for i in range(len(silent_gaps))]
    print("\n10. Phrase Analysis:")
    print(f"Average phrase length: {sum(phrase_lengths)/len(phrase_lengths):.2f} notes")
    print(f"Number of detected phrases: {len(phrase_lengths)}")
else:
    print("\n10. Phrase Analysis: No clear phrases detected")


1. Pitch Class Distribution (shows harmonic preferences):
C: 576
C#: 104
D: 405
D#: 250
E: 512
F: 496
F#: 144
G: 313
G#: 163
A: 637
A#: 295
B: 301

2. Rhythmic Variety:
Number of different note durations used: 24
Duration types used: [0.16666666666666666, 0.25, 0.3333333333333333, 0.4166666666666667, 0.5, 0.75, 0.8333333333333334, 1.0, 1.25, 1.3333333333333333, 1.4166666666666667, 1.5, 1.75, 2.0, 2.25, 2.5, 2.6666666666666665, 2.75, 3.0, 4.25, 4.5, 5.75, 6.5, 7.0]

3. Chord Usage (texture complexity):
Average notes per chord: 3.10
Total number of chords: 1152

4. Melodic Movement:
Average interval size: 4.13 semitones
Largest melodic jump: 32 semitones

5. Key Information:
Detected key: F major

6. Note Density:
Notes per quarter note: 2.79

7. Pitch Range:
Range span: 74 semitones
Lowest note: 26 (MIDI number)
Highest note: 100 (MIDI number)

8. Time Signatures Used:
6/8

9. Dynamics Analysis (MIDI Velocity):
Average velocity: 81.95
Velocity range: 25 to 127
Velocity variance: 805.87

In [29]:
import music21
import pandas as pd
import numpy as np
from collections import Counter

def extract_composer_features(midi_file):
    """
    Extract composer-related features from a MIDI file.
    Features included:
    1. Pitch Class Distribution (12 features)
    2. Note Density (notes_per_quarter)
    3. Average Chord Size (avg_chord_notes)
    4. Chord Progression Variety (chord_progression_variety)
    5. Note Duration Entropy (note_duration_entropy)
    6. Melodic Contour Ratio (melodic_contour_ratio)
    7. Range Span (range_span, lowest_note, highest_note)
    """
    
    # --------------------------
    # 1) Parse MIDI File
    # --------------------------
    parsed_stream = music21.converter.parse(midi_file)
    flat_stream = parsed_stream.flatten()
    
    # We'll use these for melodic analysis
    notes_to_parse = flat_stream.notesAndRests
    
    # Prepare a dictionary for our features
    features = {}
    
    # --------------------------
    # 2) Pitch Class Distribution
    # --------------------------
    pitch_class_counts = [0] * 12  # For C, C#, D, D#, E, F, F#, G, G#, A, A#, B
    
    for element in flat_stream.notes:
        if element.isNote:
            pitch_class_counts[element.pitch.pitchClass] += 1
        elif element.isChord:
            for p in element.pitches:
                pitch_class_counts[p.pitchClass] += 1
                
    pitch_classes = ['C', 'C#', 'D', 'D#', 'E', 'F', 
                     'F#', 'G', 'G#', 'A', 'A#', 'B']
    
    for pc, count in zip(pitch_classes, pitch_class_counts):
        features[f'pitch_{pc}'] = count
    
    # --------------------------
    # 3) Note Density
    # --------------------------
    total_duration = flat_stream.duration.quarterLength
    total_notes = len([n for n in flat_stream.notes if n.isNote or n.isChord])
    features['notes_per_quarter'] = (
        total_notes / total_duration if total_duration > 0 else 0
    )
    
    # --------------------------
    # 4) Average Chord Size
    # (Keeping 'avg_chord_notes' from original)
    # --------------------------
    chords = flat_stream.getElementsByClass('Chord')
    if len(chords) > 0:
        avg_chord_size = sum(len(c.pitches) for c in chords) / len(chords)
    else:
        avg_chord_size = 0
    features['avg_chord_notes'] = avg_chord_size
    
    # ------------------------------------------------
    # NEW FEATURE: Chord Progression Variety
    # (Replacing 'total_chords')
    # ------------------------------------------------
    # We'll chordify the piece so each slice is turned into a chord.
    chordified_stream = parsed_stream.chordify()
    chord_symbols = []
    
    # Attempt to get RomanNumerals for each chord
    # If no key context can be found, skip it.
    for c in chordified_stream.recurse().getElementsByClass(music21.chord.Chord):
        try:
            # Attempt to get chord's key context
            key_context = c.getContextByClass(music21.key.Key)
            # Convert chord to Roman Numeral notation
            roman_chord = music21.roman.romanNumeralFromChord(c, key_context)
            chord_symbols.append(roman_chord.romanNumeral)
        except:
            # In case of exceptions (e.g. no clear key), just skip
            pass
    
    # Count bigrams (chord-to-chord transitions)
    bigram_counts = Counter()
    for i in range(len(chord_symbols) - 1):
        bigram = (chord_symbols[i], chord_symbols[i+1])
        bigram_counts[bigram] += 1
    
    # Define chord_progression_variety = # of unique bigrams / total bigrams
    total_bigrams = sum(bigram_counts.values())
    chord_progression_variety = (
        len(bigram_counts) / total_bigrams if total_bigrams > 0 else 0
    )
    features['chord_progression_variety'] = chord_progression_variety
    
    # ------------------------------------------------
    # NEW FEATURE: Note Duration Entropy
    # (Replacing 'rhythm_variety')
    # ------------------------------------------------
    def get_note_duration_entropy(durations):
        if not durations:
            return 0.0
        counts = Counter(durations)
        total = sum(counts.values())
        probs = [c / total for c in counts.values()]
        return -sum(p * np.log2(p) for p in probs)
    
    durations = [float(n.quarterLength) for n in notes_to_parse if n.quarterLength > 0]
    features['note_duration_entropy'] = get_note_duration_entropy(durations)
    
    # ------------------------------------------------
    # NEW FEATURE: Melodic Contour Ratio
    # (Replacing 'avg_interval' and 'max_interval')
    # ------------------------------------------------
    # We'll measure the ratio of upward intervals to total intervals
    upward_intervals = 0
    total_intervals = 0
    
    for i in range(1, len(notes_to_parse)):
        prev_el = notes_to_parse[i - 1]
        curr_el = notes_to_parse[i]
        if prev_el.isNote and curr_el.isNote:
            interval_diff = curr_el.pitch.midi - prev_el.pitch.midi
            if interval_diff != 0:
                total_intervals += 1
                if interval_diff > 0:
                    upward_intervals += 1
    
    features['melodic_contour_ratio'] = (
        upward_intervals / total_intervals if total_intervals > 0 else 0
    )
    
    # --------------------------
    # 5) Range Span (kept as is)
    # --------------------------
    all_pitches = []
    for n in flat_stream.notes:
        if n.isNote:
            all_pitches.append(n.pitch.midi)
        elif n.isChord:
            all_pitches.extend(p.midi for p in n.pitches)
            
    if all_pitches:
        features['range_span'] = max(all_pitches) - min(all_pitches)
        features['lowest_note'] = min(all_pitches)
        features['highest_note'] = max(all_pitches)
    else:
        features['range_span'] = 0
        features['lowest_note'] = 0
        features['highest_note'] = 0
    
    # Create a DataFrame with a single row of all these features
    df = pd.DataFrame([features])
    
    return df

In [13]:
df = extract_composer_features(midi_file)

In [14]:
df.columns

Index(['pitch_C', 'pitch_C#', 'pitch_D', 'pitch_D#', 'pitch_E', 'pitch_F',
       'pitch_F#', 'pitch_G', 'pitch_G#', 'pitch_A', 'pitch_A#', 'pitch_B',
       'notes_per_quarter', 'avg_chord_notes', 'total_chords', 'avg_interval',
       'max_interval', 'rhythm_variety', 'range_span', 'lowest_note',
       'highest_note'],
      dtype='object')

In [6]:
df

Unnamed: 0,pitch_C,pitch_C#,pitch_D,pitch_D#,pitch_E,pitch_F,pitch_F#,pitch_G,pitch_G#,pitch_A,...,pitch_B,notes_per_quarter,avg_chord_notes,total_chords,avg_interval,max_interval,rhythm_variety,range_span,lowest_note,highest_note
0,576,104,405,250,512,496,144,313,163,637,...,301,822/295,3.096354,1152,4.12585,32,24,74,26,100


In [30]:
import os
import glob
from tqdm import tqdm
COMPOSERS = ["haydn", "mendelssohn", "mozart", "chopin", "schubert", "tchaikovsky"]
def get_midi_file_size(file_path):
   return os.path.getsize(file_path)
    
def process_composers_with_individual_thresholds(data_dir, composers, n_samples=15, initial_threshold_kb=8.0):
    def get_valid_files_for_composer(composer, threshold_kb):
        threshold_bytes = threshold_kb * 1024
        midi_files = glob.glob(os.path.join(data_dir, composer, '**', '*.mid'), recursive=True)
        return [f for f in midi_files if get_midi_file_size(f) <= threshold_bytes]

    processed_files = set()
    all_features = []
    
    for composer in tqdm(composers, desc="Processing composers"):
        threshold_kb = initial_threshold_kb
        files_processed = 0
        
        while files_processed < n_samples:
            valid_files = get_valid_files_for_composer(composer, threshold_kb)
            valid_files = [f for f in valid_files if f not in processed_files]
            
            if not valid_files:
                print(f"{composer}: No files at {threshold_kb:.1f}kb threshold. Increasing...")
                threshold_kb *= 1.1
                continue
                
            for midi_file in valid_files:
                if files_processed >= n_samples:
                    break
                try:
                    features_df = extract_composer_features(midi_file)
                    features_df['composer'] = composer
                    features_df['file_size_kb'] = get_midi_file_size(midi_file) / 1024
                    features_df['threshold_used_kb'] = threshold_kb
                    all_features.append(features_df)
                    processed_files.add(midi_file)
                    files_processed += 1
                except Exception as e:
                    print(f"Error processing {midi_file}: {e}")

    return pd.concat(all_features, ignore_index=True)
data_dir = "dataset"
df = process_composers_with_individual_thresholds(data_dir, COMPOSERS, n_samples=15, initial_threshold_kb=8.0)

Processing composers:  17%|██████████▏                                                  | 1/6 [02:19<11:39, 139.89s/it]

mendelssohn: No files at 8.0kb threshold. Increasing...
mendelssohn: No files at 8.8kb threshold. Increasing...
mendelssohn: No files at 9.7kb threshold. Increasing...
mendelssohn: No files at 10.6kb threshold. Increasing...
mendelssohn: No files at 11.7kb threshold. Increasing...
mendelssohn: No files at 12.9kb threshold. Increasing...


Processing composers:  67%|█████████████████████████████████████████▎                    | 4/6 [06:23<02:40, 80.19s/it]

schubert: No files at 8.0kb threshold. Increasing...
schubert: No files at 8.8kb threshold. Increasing...
schubert: No files at 9.7kb threshold. Increasing...


Processing composers: 100%|██████████████████████████████████████████████████████████████| 6/6 [08:22<00:00, 83.69s/it]


In [46]:
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier


X = df.drop(columns=['composer', 'file_size_kb', 'threshold_used_kb'])
y = df['composer']


encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)


In [63]:
# Replace the problematic values manually
X.loc[52, 'notes_per_quarter'] = 7032 / 3173  # Perform the division for row 52
X.loc[58, 'notes_per_quarter'] = 3600 / 959   # Perform the division for row 58

# Convert the entire column to float
X['notes_per_quarter'] = X['notes_per_quarter'].astype(float)

# Verify the changes
print(X.loc[[52, 58], 'notes_per_quarter'])  # Check the corrected rows
print(X['notes_per_quarter'].dtype)         # Ensure it's now float

52    2.216199
58    3.753910
Name: notes_per_quarter, dtype: float64
float64


In [97]:
df.to_pickle("midi_data_all_dataframe.pkl")
print("Data saved as Pickle!")


Data saved as Pickle!


In [100]:
df

Unnamed: 0,file_path,composer,key_signature,tempo_changes,time_signatures,num_notes,note_offset,note_duration,note_midi,note_octave,note_name
0,dataset\haydn\haydn_12_german_dances_1792_(c)i...,haydn,7.0,"[{'offset': 0.0, 'bpm': 600.0}, {'offset': 0.2...","[{'offset': 0.0, 'numerator': 3, 'denominator'...",4144,0.200000,0.347656,74,5,D5
1,dataset\haydn\haydn_12_german_dances_1792_(c)i...,haydn,7.0,"[{'offset': 0.0, 'bpm': 600.0}, {'offset': 0.2...","[{'offset': 0.0, 'numerator': 3, 'denominator'...",4144,0.575000,0.175781,78,5,F#5
2,dataset\haydn\haydn_12_german_dances_1792_(c)i...,haydn,7.0,"[{'offset': 0.0, 'bpm': 600.0}, {'offset': 0.2...","[{'offset': 0.0, 'numerator': 3, 'denominator'...",4144,0.575000,1.035156,55,3,G3
3,dataset\haydn\haydn_12_german_dances_1792_(c)i...,haydn,7.0,"[{'offset': 0.0, 'bpm': 600.0}, {'offset': 0.2...","[{'offset': 0.0, 'numerator': 3, 'denominator'...",4144,0.762500,0.175781,79,5,G5
4,dataset\haydn\haydn_12_german_dances_1792_(c)i...,haydn,7.0,"[{'offset': 0.0, 'bpm': 600.0}, {'offset': 0.2...","[{'offset': 0.0, 'numerator': 3, 'denominator'...",4144,0.950000,0.175781,83,5,B5
...,...,...,...,...,...,...,...,...,...,...,...
7863121,dataset\tchaikovsky\tchajkowski_string_quartet...,tchaikovsky,6.0,"[{'offset': 0.0, 'bpm': 60.0}, {'offset': 105....","[{'offset': 0.0, 'numerator': 6, 'denominator'...",18937,2155.773231,1.750000,63,4,D#4
7863122,dataset\tchaikovsky\tchajkowski_string_quartet...,tchaikovsky,6.0,"[{'offset': 0.0, 'bpm': 60.0}, {'offset': 105....","[{'offset': 0.0, 'numerator': 6, 'denominator'...",18937,2155.773231,1.750000,58,3,A#3
7863123,dataset\tchaikovsky\tchajkowski_string_quartet...,tchaikovsky,6.0,"[{'offset': 0.0, 'bpm': 60.0}, {'offset': 105....","[{'offset': 0.0, 'numerator': 6, 'denominator'...",18937,2155.773231,1.750000,55,3,G3
7863124,dataset\tchaikovsky\tchajkowski_string_quartet...,tchaikovsky,6.0,"[{'offset': 0.0, 'bpm': 60.0}, {'offset': 105....","[{'offset': 0.0, 'numerator': 6, 'denominator'...",18937,2155.773231,1.750000,39,2,D#2


In [None]:
def extract_midi_details(midi_file):
    """
    Extracts detailed information from a MIDI file.
    """
    try:
        midi_data = pretty_midi.PrettyMIDI(midi_file)

        # Extract notes
        notes_list = []
        note_lengths = []
        octaves = []

        for instrument in midi_data.instruments:
            for note in instrument.notes:
                notes_list.append(pretty_midi.note_number_to_name(note.pitch))
                note_lengths.append(note.end - note.start)
                octaves.append(note.pitch // 12 - 1)

        # Ensure notes are in order by offset
        notes_with_details = sorted(
            zip(notes_list, note_lengths, octaves, [note.start for note in instrument.notes]),
            key=lambda x: x[3]
        )

        notes_list = [n[0] for n in notes_with_details]
        note_lengths = [n[1] for n in notes_with_details]
        octaves = [n[2] for n in notes_with_details]

        # Tempo changes
        offsets, bpms = midi_data.get_tempo_changes()
        tempos = [{"offset": offset, "bpm": bpm} for offset, bpm in zip(offsets, bpms)]

        # Time signatures
        time_signatures = [
            {"offset": ts.time, "numerator": ts.numerator, "denominator": ts.denominator}
            for ts in midi_data.time_signature_changes
        ]

        # Number of notes
        num_notes = len(notes_list)

        # Overall key (first key change, if present)
        key_signature = (
            midi_data.key_signature_changes[0].key_number
            if midi_data.key_signature_changes else None
        )

        return {
            "notes": notes_list,
            "note_lengths": note_lengths,
            "octaves": octaves,
            "overall_key": key_signature,
            "num_notes": num_notes,
            "tempos": tempos,
            "musical_meters": time_signatures,
        }

    except Exception as e:
        return {"error": str(e), "file": midi_file}


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
import pandas as pd
df = pd.read_pickle("midi_data_all_dataframe.pkl")

# Print the DataFrame
print(df)


                                                 file_path     composer  \
0        dataset\haydn\haydn_12_german_dances_1792_(c)i...        haydn   
1        dataset\haydn\haydn_12_german_dances_1792_(c)i...        haydn   
2        dataset\haydn\haydn_12_german_dances_1792_(c)i...        haydn   
3        dataset\haydn\haydn_12_german_dances_1792_(c)i...        haydn   
4        dataset\haydn\haydn_12_german_dances_1792_(c)i...        haydn   
...                                                    ...          ...   
7863121  dataset\tchaikovsky\tchajkowski_string_quartet...  tchaikovsky   
7863122  dataset\tchaikovsky\tchajkowski_string_quartet...  tchaikovsky   
7863123  dataset\tchaikovsky\tchajkowski_string_quartet...  tchaikovsky   
7863124  dataset\tchaikovsky\tchajkowski_string_quartet...  tchaikovsky   
7863125  dataset\tchaikovsky\tchajkowski_string_quartet...  tchaikovsky   

         key_signature                                      tempo_changes  \
0                  7.0

In [5]:
import pandas as pd

# Assuming `df` is the DataFrame loaded from your pickle file
# Step 1: Extract file name
df['file_name'] = df['file_path'].str.extract(r'([^\\]+)\.midi')

# Step 2: Group by file name
grouped = df.groupby('file_name')


In [7]:
print(grouped)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000017CECFAE300>


In [2]:
import pickle

# Load the pickle file
file_path = "REVISED_midi_data_all.pkl"  # Update this path if necessary
with open(file_path, "rb") as f:
    data = pickle.load(f)

# Inspect the structure
print(f"Top-level keys (composers): {data.keys()}")
for composer in data:
    print(f"Composer: {composer}, Number of files: {len(data[composer])}")


Top-level keys (composers): dict_keys(['haydn', 'mendelssohn', 'mozart', 'chopin', 'schubert', 'tchaikovsky'])
Composer: haydn, Number of files: 737
Composer: mendelssohn, Number of files: 45
Composer: mozart, Number of files: 605
Composer: chopin, Number of files: 98
Composer: schubert, Number of files: 271
Composer: tchaikovsky, Number of files: 239


In [47]:
import numpy as np
import pickle

# Load the pickle file
with open("midi_data_all.pkl", "rb") as f:
    data = pickle.load(f)

# Prepare lists to store features and labels
X = []  # Features
y = []  # Labels (composer names)

# Define a mapping of notes to numerical values
note_to_index = {
    f"{note}{octave}": octave * 12 + idx
    for octave in range(0, 10)  # Covers octaves 0 to 9 (inclusive)
    for idx, note in enumerate(["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"])
}

def encode_notes(notes):
    """
    Encode note names into numerical values.
    """ 
    return [note_to_index[note] for note in notes if note in note_to_index]

# Iterate over composers and their files
for composer, files in data.items():
    for file_name, details in files.items():
        if "error" in details:
            continue  # Skip files with errors

        # Extract features
        notes = encode_notes(details["notes"])  # Numerical representation of notes
        note_lengths = details["note_lengths"]
        octaves = details["octaves"]
        overall_key = details["overall_key"]
        num_notes = details["num_notes"]
        tempos = [t["bpm"] for t in details["tempos"]]
        musical_meters = [(m["numerator"], m["denominator"]) for m in details["musical_meters"]]

        # Flatten the features for simplicity (e.g., concatenate arrays)
        features = {
            "notes": notes,
            "note_lengths": note_lengths,
            "octaves": octaves,
            "overall_key": overall_key,
            "num_notes": num_notes,
            "tempos": tempos,
            "musical_meters": musical_meters,
        }

        # Append features and label
        X.append(features)
        y.append(composer)

# Convert labels to integers
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)


# Remove files with zero notes
filtered_X = []
filtered_y = []

for i, features in enumerate(X):
    if len(features["notes"]) > 0:  # Keep only files with notes
        filtered_X.append(features)
        filtered_y.append(y[i])

# Update X and y
X = filtered_X
y = filtered_y

print(f"Number of valid files: {len(X)}")  # Optional: Check how many files remain


Number of valid files: 1807


In [50]:
print(note_to_index)

{'C0': 0, 'C#0': 1, 'D0': 2, 'D#0': 3, 'E0': 4, 'F0': 5, 'F#0': 6, 'G0': 7, 'G#0': 8, 'A0': 9, 'A#0': 10, 'B0': 11, 'C1': 12, 'C#1': 13, 'D1': 14, 'D#1': 15, 'E1': 16, 'F1': 17, 'F#1': 18, 'G1': 19, 'G#1': 20, 'A1': 21, 'A#1': 22, 'B1': 23, 'C2': 24, 'C#2': 25, 'D2': 26, 'D#2': 27, 'E2': 28, 'F2': 29, 'F#2': 30, 'G2': 31, 'G#2': 32, 'A2': 33, 'A#2': 34, 'B2': 35, 'C3': 36, 'C#3': 37, 'D3': 38, 'D#3': 39, 'E3': 40, 'F3': 41, 'F#3': 42, 'G3': 43, 'G#3': 44, 'A3': 45, 'A#3': 46, 'B3': 47, 'C4': 48, 'C#4': 49, 'D4': 50, 'D#4': 51, 'E4': 52, 'F4': 53, 'F#4': 54, 'G4': 55, 'G#4': 56, 'A4': 57, 'A#4': 58, 'B4': 59, 'C5': 60, 'C#5': 61, 'D5': 62, 'D#5': 63, 'E5': 64, 'F5': 65, 'F#5': 66, 'G5': 67, 'G#5': 68, 'A5': 69, 'A#5': 70, 'B5': 71, 'C6': 72, 'C#6': 73, 'D6': 74, 'D#6': 75, 'E6': 76, 'F6': 77, 'F#6': 78, 'G6': 79, 'G#6': 80, 'A6': 81, 'A#6': 82, 'B6': 83, 'C7': 84, 'C#7': 85, 'D7': 86, 'D#7': 87, 'E7': 88, 'F7': 89, 'F#7': 90, 'G7': 91, 'G#7': 92, 'A7': 93, 'A#7': 94, 'B7': 95, 'C8': 96,

In [70]:
lengths = [len(features["note_lengths"]) for features in X]
print(f"Max length: {max(lengths)}, Min length: {min(lengths)}, Average length: {np.mean(lengths)}")


Max length: 47134, Min length: 25, Average length: 4253.579413392363


In [57]:
def prepare_features_notes(X, max_len=1000):
    """
    Prepare features for LSTM using only notes.
    """
    feature_matrix = []
    for features in X:
        # Truncate or pad the notes to a fixed length
        padded_notes = np.pad(
            features["notes"], (0, max(0, max_len - len(features["notes"])))
        )[:max_len]
        feature_matrix.append(padded_notes)
    return np.array(feature_matrix)  # Shape: (num_samples, max_len)

# Prepare features
X_features = prepare_features_notes(X, max_len=1000)  # Adjust max_len as needed
print(f"Feature matrix shape: {X_features.shape}")


Feature matrix shape: (1807, 1000)


In [58]:
X_features.shape

(1807, 1000)

In [54]:
len(y)

1807

In [59]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_features, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Training set: {X_train.shape}, Testing set: {X_test.shape}")


Training set: (1445, 1000), Testing set: (362, 1000)


In [32]:
X_train_tensor.shape

torch.Size([1544, 100, 1])

In [60]:
import torch
import numpy as np

# Add an extra dimension for feature size
X_train_tensor = torch.tensor(X_train[:, :, np.newaxis], dtype=torch.float32)
X_test_tensor = torch.tensor(X_test[:, :, np.newaxis], dtype=torch.float32)

# Convert labels to PyTorch tensors
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

print(f"X_train_tensor shape: {X_train_tensor.shape}, y_train_tensor shape: {y_train_tensor.shape}")
print(f"X_test_tensor shape: {X_test_tensor.shape}, y_test_tensor shape: {y_test_tensor.shape}")


X_train_tensor shape: torch.Size([1445, 1000, 1]), y_train_tensor shape: torch.Size([1445])
X_test_tensor shape: torch.Size([362, 1000, 1]), y_test_tensor shape: torch.Size([362])


In [61]:
import torch.nn as nn

class ComposerClassifierLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(ComposerClassifierLSTM, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)  # Embedding for notes
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        x = self.embedding(x.squeeze(-1).long())  # Convert note indices to embeddings
        _, (hidden, _) = self.lstm(x)
        out = self.fc(hidden[-1])  # Use the last hidden state
        return out

# Model parameters
input_size = len(note_to_index)  # Total number of unique notes
hidden_size = 128  # Number of hidden units in the LSTM
num_layers = 2  # Number of LSTM layers
num_classes = len(label_encoder.classes_)  # Number of composers

model = ComposerClassifierLSTM(input_size, hidden_size, num_layers, num_classes)
print(model)


ComposerClassifierLSTM(
  (embedding): Embedding(120, 128)
  (lstm): LSTM(128, 128, num_layers=2, batch_first=True)
  (fc): Linear(in_features=128, out_features=6, bias=True)
)


In [62]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [63]:
num_epochs = 10
batch_size = 32


train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)


for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader)}")


Epoch 1/10, Loss: 1.5140647888183594
Epoch 2/10, Loss: 1.3956472510876863
Epoch 3/10, Loss: 1.3572009335393491
Epoch 4/10, Loss: 1.3206292805464372
Epoch 5/10, Loss: 1.2674581149350042
Epoch 6/10, Loss: 1.1975724710070568
Epoch 7/10, Loss: 1.1273693066576254
Epoch 8/10, Loss: 1.034328519002251
Epoch 9/10, Loss: 0.915683785210485
Epoch 10/10, Loss: 0.8151581922303075


In [64]:
model.eval()
with torch.no_grad():
    test_outputs = model(X_test_tensor)
    _, predicted = torch.max(test_outputs, 1)
    accuracy = (predicted == y_test_tensor).sum().item() / len(y_test_tensor)

print(f"Test Accuracy: {accuracy * 100:.2f}%")


Test Accuracy: 37.57%


In [71]:
import os

def count_midi_files(base_directory):
    composer_counts = {}  # Dictionary to store the count for each composer

    # Walk through the base directory
    for composer in os.listdir(base_directory):
        composer_path = os.path.join(base_directory, composer)

        # Skip if not a directory
        if not os.path.isdir(composer_path):
            continue

        # Initialize a count for the composer
        midi_count = 0

        # Walk through the composer directory recursively
        for root, dirs, files in os.walk(composer_path):
            # Skip the '!live!' folder
            if "!live!" in dirs:
                dirs.remove("!live!")

            # Count MIDI files in the current directory
            midi_count += sum(1 for file in files if file.endswith(('.mid', '.midi')))

        # Store the count for the composer
        composer_counts[composer] = midi_count

    return composer_counts

# Directory containing the dataset
base_directory = "MUSIC_DATASET"  # Replace with the actual path

# Count MIDI files
composer_counts = count_midi_files(base_directory)

# Print the results
for composer, count in composer_counts.items():
    print(f"{composer}: {count} MIDI files")

# Total MIDI files
total_files = sum(composer_counts.values())
print(f"Total MIDI files: {total_files}")

albeniz: 61 MIDI files
albinoni: 49 MIDI files
albrechtsberger: 13 MIDI files
alkan: 237 MIDI files
anglebert: 67 MIDI files
anonymous: 65 MIDI files
bach-js: 2161 MIDI files
bartok: 12 MIDI files
beethoven: 644 MIDI files
berlioz: 8 MIDI files
bizet: 21 MIDI files
brahms: 146 MIDI files
bruckner: 30 MIDI files
busoni: 37 MIDI files
buxtehude: 94 MIDI files
byrd: 109 MIDI files
chopin: 98 MIDI files
clementi: 41 MIDI files
couperin: 117 MIDI files
cramer: 0 MIDI files
dandrieu: 211 MIDI files
debussy: 159 MIDI files
desprez: 35 MIDI files
dowland: 61 MIDI files
dufay: 13 MIDI files
dvorak: 145 MIDI files
faure: 90 MIDI files
franck: 22 MIDI files
frescobaldi: 62 MIDI files
froberger: 32 MIDI files
gabrieli's: 28 MIDI files
gershwin: 10 MIDI files
gesualdo: 37 MIDI files
godowsky: 58 MIDI files
gottschalk: 36 MIDI files
grieg: 17 MIDI files
guilmant: 11 MIDI files
handel: 527 MIDI files
haydn: 737 MIDI files
hindemith: 8 MIDI files
janacek: 22 MIDI files
joplin: 0 MIDI files
karg-elert:

In [1]:
COMPOSERS = sorted(["handel", "alkan", "schubert", "mozart", "scarlatti", "victoria"])


In [2]:
print(COMPOSERS)

['alkan', 'handel', 'mozart', 'scarlatti', 'schubert', 'victoria']
