In [38]:
import matplotlib.pyplot as plt
import mido
import numpy as np
import os
import pandas as pd
import seaborn as sns
import warnings
from music21 import chord, converter, instrument, note, pitch
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

## Data Collection
This section collects MIDI file paths and their corresponding composer labels from a specified dataset directory, organized into different splits (dev, test, train). It filters the files based on a predefined list of composers, appending the paths and labels to respective lists. Finally, it prints out the number of collected files and samples of file paths and labels for verification.

In [19]:
# Define the path to the dataset
dataset_path = 'Composer_Dataset/NN_midi_files_extended'

# Initialize lists to hold the data
data = []
labels = []

# List of composers to include
composers_to_include = ['bach', 'beethoven', 'chopin', 'mozart']

# Data Collection
for dataset_split in ['dev', 'test', 'train']:
    split_path = os.path.join(dataset_path, dataset_split)
    for composer_folder in os.listdir(split_path):
        if composer_folder.lower() in composers_to_include:
            composer_path = os.path.join(split_path, composer_folder)
            if os.path.isdir(composer_path):
                for midi_file in os.listdir(composer_path):
                    if midi_file.endswith('.mid'):
                        midi_path = os.path.join(composer_path, midi_file)
                        labels.append(composer_folder)
                        data.append(midi_path)

# Verify 
print(f"Collected {len(data)} MIDI files.")
print("Sample data paths:")
for path in data[:5]:
    print(f"  - {path}")
print("Sample labels:")
for label in labels[:5]:
    print(f"  - {label}")

Collected 148 MIDI files.
Sample data paths:
  - Composer_Dataset/NN_midi_files_extended/dev/mozart/mozart039.mid
  - Composer_Dataset/NN_midi_files_extended/dev/mozart/mozart035.mid
  - Composer_Dataset/NN_midi_files_extended/dev/mozart/mozart020.mid
  - Composer_Dataset/NN_midi_files_extended/dev/mozart/mozart040.mid
  - Composer_Dataset/NN_midi_files_extended/dev/chopin/chopin069.mid
Sample labels:
  - mozart
  - mozart
  - mozart
  - mozart
  - chopin


## Data Pre-processing
This section preprocesses MIDI files by extracting note and chord information, handling potential parsing errors and ignoring warnings. It processes the files in batches for better debugging and tracks successfully processed files as well as those that failed. The script prints the progress, number of successfully processed files, and logs any files that encountered errors.

In [23]:
def preprocess_midi(file_path):
    """
    Preprocess a MIDI file to extract note, chord, and tempo information.
    """
    try:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', category=UserWarning)
            midi = converter.parse(file_path)
    except Exception as e:
        print(f"Error parsing {file_path}: {e}")
        return []

    notes = []
    for element in midi.flat.notes:
        if isinstance(element, note.Note):
            notes.append(str(element.pitch))
        elif isinstance(element, chord.Chord):
            notes.append('.'.join(str(n) for n in element.normalOrder))
        else:
            continue  # Skip elements that are not notes or chords

    return notes

# Apply preprocessing
preprocessed_data = []
failed_files = []

# Process in smaller batches for debugging
batch_size = 10  # Adjust this value as needed
for i, file_path in enumerate(data):
    if i % batch_size == 0:
        print(f"Processing batch {i//batch_size + 1}...")

    preprocessed = preprocess_midi(file_path)
    if preprocessed:
        preprocessed_data.append(preprocessed)
    else:
        failed_files.append(file_path)

    # Print progress every 10 files
    if (i + 1) % 10 == 0:
        print(f"Processed {i + 1} files...")

# Verify
print(f"Processed {len(preprocessed_data)} MIDI files successfully.")
print(f"Failed to process {len(failed_files)} MIDI files.")
print(f"Sample preprocessed data: {preprocessed_data[:1]}")

# Log failed files
if failed_files:
    print("Failed files:")
    for file_path in failed_files:
        print(file_path)

Processing batch 1...
Processed 10 files...
Processing batch 2...
Error parsing Composer_Dataset/NN_midi_files_extended/test/mozart/mozart025.mid: 5512297104
Processed 20 files...
Processing batch 3...
Processed 30 files...
Processing batch 4...
Processed 40 files...
Processing batch 5...
Error parsing Composer_Dataset/NN_midi_files_extended/train/mozart/mozart009.mid: 4991915680
Processed 50 files...
Processing batch 6...
Processed 60 files...
Processing batch 7...
Processed 70 files...
Processing batch 8...
Processed 80 files...
Processing batch 9...
Processed 90 files...
Processing batch 10...
Processed 100 files...
Processing batch 11...
Processed 110 files...
Processing batch 12...
Processed 120 files...
Processing batch 13...
Processed 130 files...
Processing batch 14...
Processed 140 files...
Processing batch 15...
Processed 146 MIDI files successfully.
Failed to process 2 MIDI files.
Sample preprocessed data: [['A4', 'A5', 'F#5', '2.4', '2.6', 'C#5', 'D5', '2.4', '7.11', 'C#5',

## Feature Extraction
This section extracts numerical features from the preprocessed MIDI notes and chords by converting them into MIDI pitch values. It handles both individual notes and chords, skipping invalid or empty data and logging any errors encountered. The script then prints the total number of processed files and provides a sample of the extracted features for verification.

In [25]:
# Step 3: Feature Extraction
def extract_features(notes):
    """
    Extract features from the notes and chords in a piece of music.
    """
    features = []

    # Convert notes and chords into numerical representation
    for n in notes:
        if n == '':
            continue  # Skip empty strings
        try:
            if '.' in n:
                chord_notes = n.split('.')
                if all(c.isdigit() for c in chord_notes):
                    chord_notes = [pitch.Pitch(int(c)).midi for c in chord_notes]
                    features.append(chord_notes)
                else:
                    print(f"Invalid chord encountered: {n}")
            else:
                if n.isdigit():
                    features.append(pitch.Pitch(int(n)).midi)
                else:
                    features.append(pitch.Pitch(n).midi)
        except pitch.PitchException:
            print(f"Invalid pitch encountered: {n}")
            continue  # Skip invalid pitch data

    return features

# Apply feature extraction
features = [extract_features(notes) for notes in preprocessed_data]

# Verify Step 3
print(f"Extracted features from {len(features)} MIDI files.")
print(f"Sample features: {features[:1]}")

Extracted features from 146 MIDI files.
Sample features: [[69, 81, 78, [62, 64], [62, 66], 73, 74, [62, 64], [67, 71], 73, 74, [62, 66, 69], [61, 64], 67, 79, 76, [61, 62], [64, 69], 71, 73, 69, [61, 64], 76, 81, [62, 64, 67, 69], [62, 66], 74, 78, 50, 62, 79, 64, [69, 71], 66, [66, 67], [62, 64], 74, 50, 78, 62, 79, 64, [69, 71], 66, [66, 67], [62, 64], 74, 83, 55, 67, 85, 64, [61, 62], 66, [69, 71], [66, 67], [69, 71], 67, [66, 67], [62, 64], [69, 61, 64], 69, 69, 81, 78, [62, 64], [62, 66], 73, 74, [62, 64], [67, 71], 73, 74, [62, 66, 69], [61, 64], 67, 79, 76, [69, 61, 62, 64], 71, 73, [69, 61, 64], 76, 81, [62, 64, 67, 69], [62, 66], 74, 78, 50, 62, 79, 64, [69, 71], 66, [66, 67], [62, 64], 79, 43, 83, 55, 85, 57, [62, 64], 59, [71, 61], [67, 69], [71, 61], [64, 67, 71], [67, 69], [64, 66], [69, 71], [66, 67], [62, 64], [67, 69], [64, 66], [61, 62], 74, [62, 66], [62, 66], [62, 66], 69, 45, [66, 69], [66, 69], 74, 50, [69, 62], [69, 62], 78, 54, [62, 66], [62, 66], 81, 57, [62, 66

## Convert Features and Labels into a DataFrame
This section creates a DataFrame from the features and labels, ensuring that both arrays have the same length by trimming the longer array. It prints the lengths of the features and labels, adjusts them if necessary, and then constructs and verifies the DataFrame to ensure proper alignment of the data.

In [27]:
# Step 4: Convert features and labels into a DataFrame
# Example arrays (replace these with your actual data)
features = [[1, 2], [3, 4], [5, 6]]
labels = [0, 1, 0, 1]  # Different length from features

# Check lengths
features_length = len(features)
labels_length = len(labels)

print(f"Length of features: {features_length}")
print(f"Length of labels: {labels_length}")

# Make the lengths equal by trimming the longer array
if features_length > labels_length:
    features = features[:labels_length]
elif labels_length > features_length:
    labels = labels[:features_length]

# Now create the DataFrame
df = pd.DataFrame({'features': features, 'label': labels})

# Verify the DataFrame
print(df.head())

Length of features: 3
Length of labels: 4
  features  label
0   [1, 2]      0
1   [3, 4]      1
2   [5, 6]      0


## Explatory Data Analysis - in progress

In [37]:
# Print information about the original dataframe
print("Shape of original dataframe:", df.shape)
print("\nColumns in original dataframe:")
print(df.columns)
print("\nSample of 'features' column:")
print(df['features'].head())

# Function to extract more advanced features
def extract_advanced_features(feature_list):
    if not isinstance(feature_list, (list, np.ndarray)):
        print(f"Unexpected type for feature_list: {type(feature_list)}")
        return pd.Series()  # Return an empty Series if input is invalid
    
    feature_series = pd.Series(feature_list)
    return pd.Series({
        'mean': feature_series.mean(),
        'std': feature_series.std(),
        'median': feature_series.median(),
        'min': feature_series.min(),
        'max': feature_series.max(),
        'range': feature_series.max() - feature_series.min(),
        'unique_notes': feature_series.nunique(),
        'mode': feature_series.mode().iloc[0] if not feature_series.mode().empty else np.nan,
        'skewness': feature_series.skew(),
        'kurtosis': feature_series.kurtosis(),
        'q1': feature_series.quantile(0.25),
        'q3': feature_series.quantile(0.75),
        'iqr': feature_series.quantile(0.75) - feature_series.quantile(0.25),
    })

# Apply advanced feature extraction
advanced_features = df['features'].apply(extract_advanced_features)

print("\nShape of advanced_features:")
print(advanced_features.shape)
print("\nColumns in advanced_features:")
print(advanced_features.columns)

print("\nNaN or infinite values in advanced features:")
print(advanced_features.isna().sum())
print("\nInfinite values:")
print(np.isinf(advanced_features).sum())

# Combine with original dataframe
df_advanced = pd.concat([df, advanced_features], axis=1)

print("\nShape of df_advanced:", df_advanced.shape)
print("\nColumns in df_advanced:")
print(df_advanced.columns)

Shape of original dataframe: (3, 3)

Columns in original dataframe:
Index(['features', 'label', 'label_encoded'], dtype='object')

Sample of 'features' column:
0    [1, 2]
1    [3, 4]
2    [5, 6]
Name: features, dtype: object

Shape of advanced_features:
(3, 13)

Columns in advanced_features:
Index(['mean', 'std', 'median', 'min', 'max', 'range', 'unique_notes', 'mode',
       'skewness', 'kurtosis', 'q1', 'q3', 'iqr'],
      dtype='object')

NaN or infinite values in advanced features:
mean            0
std             0
median          0
min             0
max             0
range           0
unique_notes    0
mode            0
skewness        3
kurtosis        3
q1              0
q3              0
iqr             0
dtype: int64

Infinite values:
mean            0
std             0
median          0
min             0
max             0
range           0
unique_notes    0
mode            0
skewness        0
kurtosis        0
q1              0
q3              0
iqr             0
dtype: in