In [None]:
# importing all necessary libraries
import numpy as np 
import zipfile
import io
import pretty_midi
import os
import pandas as pd 
import tensorflow as tf 
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, LabelEncoder
from collections import Counter
import tempfile
import pygame
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dropout, Dense
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Suppress specific pretty_midi warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [13]:
# Dataset root, relative to the working directory. The walk cell takes the
# composer label from the first folder level beneath this directory.
root_dir = "selectedcomposers"

# Accumulator: one dict of features is appended per successfully parsed file.
midi_data = list()

def extract_midi_features(midi, chord_window=0.05):
    """Summarise one parsed MIDI object as a flat dict of numeric features.

    Parameters
    ----------
    midi : object
        Parsed MIDI container. Only ``midi.instruments`` (each exposing a
        ``notes`` list whose items carry ``start``, ``end``, ``pitch`` and
        ``velocity``), ``midi.get_end_time()`` and ``midi.estimate_tempo()``
        are used.
    chord_window : float, optional
        Notes of one instrument whose ``start`` lies less than this far from
        the first note of the current group are counted as a single chord.
        Expressed in the same unit as the note start times. Defaults to
        0.05, the threshold that used to be hard-coded here.

    Returns
    -------
    dict
        ``tempo``, ``num_instruments``, ``duration``, ``note_count``, pitch
        statistics, note-duration statistics, ``note_density``, velocity
        statistics, ``avg_chord_size`` and ``chord_density``. Note-derived
        statistics are 0 when there are no notes; both densities are 0 when
        the end time is not positive.

    Notes
    -----
    The input object is left untouched: each instrument's notes are copied
    into a new sorted list instead of being sorted in place. Exceptions
    raised by ``midi.get_end_time()`` or ``midi.estimate_tempo()`` are not
    caught here and propagate to the caller.
    """
    pitches = []
    velocities = []
    durations = []
    chord_sizes = []

    for instrument in midi.instruments:
        # sorted() returns a new list, so instrument.notes keeps its order.
        inst_notes = sorted(instrument.notes, key=lambda n: n.start)

        pitches.extend(n.pitch for n in inst_notes)
        velocities.extend(n.velocity for n in inst_notes)
        durations.extend(n.end - n.start for n in inst_notes)

        # Chords are detected per instrument. Every candidate is compared to
        # the FIRST note of the group (index i), so a group never spans more
        # than chord_window regardless of how many notes it contains.
        i = 0
        while i < len(inst_notes):
            j = i + 1
            while (j < len(inst_notes)
                   and abs(inst_notes[j].start - inst_notes[i].start) < chord_window):
                j += 1
            chord_sizes.append(j - i)
            i = j

    note_count = len(pitches)
    duration = midi.get_end_time()

    return {
        'tempo': midi.estimate_tempo(),
        'num_instruments': len(midi.instruments),
        'duration': duration,
        'note_count': note_count,
        'avg_pitch': np.mean(pitches) if pitches else 0,
        'pitch_range': (max(pitches) - min(pitches)) if pitches else 0,
        'std_pitch': np.std(pitches) if pitches else 0,
        # Ties resolve to the pitch encountered first (Counter insertion order).
        'most_common_pitch': Counter(pitches).most_common(1)[0][0] if pitches else 0,
        'avg_duration': np.mean(durations) if durations else 0,
        'std_duration': np.std(durations) if durations else 0,
        'note_density': note_count / duration if duration > 0 else 0,
        'velocity_mean': np.mean(velocities) if velocities else 0,
        'velocity_std': np.std(velocities) if velocities else 0,
        'avg_chord_size': np.mean(chord_sizes) if chord_sizes else 0,
        # Number of chord groups (not notes) per unit of time.
        'chord_density': len(chord_sizes) / duration if duration > 0 else 0
    }


In [14]:
# Walk and extract all valid MIDI files
# Start from an empty list so that re-running this cell rebuilds the table
# instead of appending a second copy of every row to the existing list.
midi_data = []

for subdir, dirs, files in os.walk(root_dir):
    for file in files:
        if file.endswith('.mid'):
            file_path = os.path.join(subdir, file)
            # The first folder level below root_dir is used as the composer label.
            composer = os.path.relpath(file_path, root_dir).split(os.sep)[0]
            split = 'train'
            try:
                midi = pretty_midi.PrettyMIDI(file_path)
                features = extract_midi_features(midi)
                midi_data.append({
                    'split': split,
                    'composer': composer,
                    'filename': file,
                    **features
                })
            except Exception as e:
                # Best effort: report the unreadable file and keep going.
                print(f"Skipped {file_path}: {e}")

# Create DataFrame
midi_df = pd.DataFrame(midi_data)
print(f"Loaded {len(midi_df)} MIDI files.")
print(midi_df.head())

# Optional: Save
# midi_df.to_csv("midi_features_clean.csv", index=False)

Skipped selectedcomposers/Mozart/Piano Sonatas/Nueva carpeta/K281 Piano Sonata n03 3mov.mid: Could not decode key with 2 flats and mode 2
Skipped selectedcomposers/Beethoven/Anhang 14-3.mid: Could not decode key with 3 flats and mode 255
Loaded 1528 MIDI files.
   split composer                                  filename       tempo  \
0  train   Mozart            K495 Horn Concerto n4 1mov.mid  218.736624   
1  train   Mozart             K626 Requiem 05 Recordare.mid  159.249987   
2  train   Mozart  K492 Overture ''Le Nozze di Figaro''.mid  229.670316   
3  train   Mozart                  K427 Great Mass 1mov.mid  150.909091   
4  train   Mozart         K299 Flute Harp Concerto 3mov.mid  154.591365   

   num_instruments    duration  note_count  avg_pitch  pitch_range  std_pitch  \
0                8  404.032634        7009  62.901270           58  11.644100   
1               13  361.846184        3536  61.033654           57  11.257112   
2                8  304.218882       10671  

In [15]:
# (rows, columns) of the per-file feature table built in the previous cell.
midi_df.shape

(1528, 18)

In [16]:
# Dataset root, relative to the working directory.
root_dir = "selectedcomposers"

# Number of consecutive notes that make up one chunk (one output row).
chunk_size = 200

# Accumulator: one dict of features is appended per chunk.
midi_data = list()

def extract_chunk_features(notes, duration, num_instruments, tempo, chord_window=0.05):
    """Compute the feature dict for one chunk of notes.

    Parameters
    ----------
    notes : list
        Note-like objects exposing ``start``, ``end``, ``pitch`` and
        ``velocity``. The list is read only; it is never reordered.
    duration : float
        Length of the chunk as measured by the caller. Stored as-is and used
        as the denominator of both density features.
    num_instruments : int
        Stored as-is under ``num_instruments``.
    tempo : float
        Stored as-is under ``tempo``.
    chord_window : float, optional
        Notes whose ``start`` lies less than this far from the first note of
        the current group are counted as a single chord. Expressed in the
        same unit as the note start times. Defaults to 0.05, the threshold
        that used to be hard-coded here.

    Returns
    -------
    dict
        ``tempo``, ``num_instruments``, ``duration``, ``note_count``, pitch
        statistics, note-duration statistics, ``note_density``, velocity
        statistics, ``avg_chord_size`` and ``chord_density``. Note-derived
        statistics are 0 for an empty chunk; both densities are 0 when
        ``duration`` is not positive.
    """
    # Per-note values are collected in the caller's order, so ties in
    # most_common_pitch resolve exactly as they did before.
    pitches = [n.pitch for n in notes]
    velocities = [n.velocity for n in notes]
    durations = [n.end - n.start for n in notes]

    # Chord sizes: work on a sorted copy so the caller's list is not mutated.
    ordered = sorted(notes, key=lambda n: n.start)
    chord_sizes = []
    i = 0
    while i < len(ordered):
        # Every candidate is compared to the FIRST note of the group (index i),
        # so a group never spans more than chord_window.
        j = i + 1
        while j < len(ordered) and abs(ordered[j].start - ordered[i].start) < chord_window:
            j += 1
        chord_sizes.append(j - i)
        i = j

    return {
        'tempo': tempo,
        'num_instruments': num_instruments,
        'duration': duration,
        'note_count': len(notes),
        'avg_pitch': np.mean(pitches) if pitches else 0,
        'pitch_range': max(pitches) - min(pitches) if pitches else 0,
        'std_pitch': np.std(pitches) if pitches else 0,
        'most_common_pitch': Counter(pitches).most_common(1)[0][0] if pitches else 0,
        'avg_duration': np.mean(durations) if durations else 0,
        'std_duration': np.std(durations) if durations else 0,
        'note_density': len(notes) / duration if duration > 0 else 0,
        'velocity_mean': np.mean(velocities) if velocities else 0,
        'velocity_std': np.std(velocities) if velocities else 0,
        'avg_chord_size': np.mean(chord_sizes) if chord_sizes else 0,
        # Number of chord groups (not notes) per unit of time.
        'chord_density': len(chord_sizes) / duration if duration > 0 else 0
    }

In [17]:
# Start from an empty list so that re-running this cell rebuilds the chunk
# table instead of appending a duplicate copy of every row.
midi_data = []

for subdir, dirs, files in os.walk(root_dir):
    for file in files:
        if file.endswith('.mid'):
            file_path = os.path.join(subdir, file)
            # The first folder level below root_dir is used as the composer label.
            composer = os.path.relpath(file_path, root_dir).split(os.sep)[0]
            split = 'train'

            try:
                midi = pretty_midi.PrettyMIDI(file_path)
                # Tempo and instrument count are measured once per file and
                # repeated on every chunk row of that file.
                tempo = midi.estimate_tempo()
                num_instruments = len(midi.instruments)

                # Pool the notes of every instrument into one list.
                all_notes = []
                for instrument in midi.instruments:
                    all_notes.extend(instrument.notes)

                if len(all_notes) < chunk_size:
                    continue  # skip files too short to yield a single chunk

                # Sort notes chronologically
                all_notes.sort(key=lambda n: n.start)

                # Split into consecutive chunks of chunk_size notes. The stop
                # bound leaves out a trailing remainder shorter than chunk_size.
                for i in range(0, len(all_notes) - chunk_size + 1, chunk_size):
                    chunk_notes = all_notes[i:i+chunk_size]

                    # NOTE(review): measured from the first note's start to the
                    # end of the last-STARTED note; an earlier note that is held
                    # longer is not accounted for. Confirm this is intended.
                    chunk_duration = chunk_notes[-1].end - chunk_notes[0].start
                    features = extract_chunk_features(chunk_notes, chunk_duration, num_instruments, tempo)

                    midi_data.append({
                        'split': split,
                        'composer': composer,
                        'filename': file,
                        **features
                    })

            except Exception as e:
                # Best effort: report the unreadable file and keep going.
                print(f"Skipped {file_path}: {e}")

# Convert to DataFrame
midi_df = pd.DataFrame(midi_data)
print(f"Created {len(midi_df)} chunked rows across all MIDI files.")
print(midi_df.head())

# Optional: Save to disk
# midi_df.to_csv("chunked_midi_features.csv", index=False)

Skipped selectedcomposers/Mozart/Piano Sonatas/Nueva carpeta/K281 Piano Sonata n03 3mov.mid: Could not decode key with 2 flats and mode 2
Skipped selectedcomposers/Beethoven/Anhang 14-3.mid: Could not decode key with 3 flats and mode 255
Created 23674 chunked rows across all MIDI files.
   split composer                        filename       tempo  \
0  train   Mozart  K495 Horn Concerto n4 1mov.mid  218.736624   
1  train   Mozart  K495 Horn Concerto n4 1mov.mid  218.736624   
2  train   Mozart  K495 Horn Concerto n4 1mov.mid  218.736624   
3  train   Mozart  K495 Horn Concerto n4 1mov.mid  218.736624   
4  train   Mozart  K495 Horn Concerto n4 1mov.mid  218.736624   

   num_instruments   duration  note_count  avg_pitch  pitch_range  std_pitch  \
0                8   8.749994         200     63.725           50  11.842693   
1                8   8.965511         200     65.105           52  13.510143   
2                8   5.646548         200     66.345           48  11.992747   
3

In [18]:
# (rows, columns) of the chunked feature table: one row per full chunk.
midi_df.shape

(23674, 18)

In [20]:

# Write the current midi_df to a CSV in the working directory, without the
# index column. An existing file of the same name is overwritten.
midi_df.to_csv("chunked_midi_features.csv", index=False)