# Chord Extraction from MIDI Files

This is a short script that extracts chords from MIDI files using the Chorder library (https://github.com/joshuachang2311/chorder).

As an example, I am using the Lakh MIDI Dataset (https://colinraffel.com/projects/lmd/) and extracting chords from all songs in the Soul genre.

## Setup

In [None]:
# Install dependencies (notebook shell commands)
# miditoolkit: used below to parse the MIDI files
!pip install miditoolkit
# chorder: provides the Dechorder chord detector used below
!pip install chorder
# fuzzywuzzy: fuzzy string matching, used below to match artist names to dataset folders
!pip install fuzzywuzzy[speedup]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting miditoolkit
  Downloading miditoolkit-0.1.16-py3-none-any.whl (20 kB)
Collecting mido>=1.1.16
  Downloading mido-1.2.10-py2.py3-none-any.whl (51 kB)
[K     |████████████████████████████████| 51 kB 2.4 MB/s 
[?25hInstalling collected packages: mido, miditoolkit
Successfully installed miditoolkit-0.1.16 mido-1.2.10
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting chorder
  Downloading chorder-0.1.4-py3-none-any.whl (10 kB)
Installing collected packages: chorder
Successfully installed chorder-0.1.4
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fuzzywuzzy[speedup]
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Collecting python-levenshtein>=0.12
  Downloading python-Levenshtein-0.12.2.tar.gz (50 kB)
[K     |████████████████████████████████| 5

In [None]:
# Import needed modules and create IO dirs
import os
import copy
import numpy as np
import multiprocessing as mp

import pandas as pd
import re

from chorder import Dechorder
import miditoolkit

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

import pickle
import shutil
import sys

# Create the working directories. exist_ok=True makes this safe to re-run and
# avoids the check-then-create race of testing os.path.exists() first.
for io_dir in ('./Input_MIDIs', './Chorded_MIDIs'):
  os.makedirs(io_dir, exist_ok=True)

In [None]:
# Pitch-class integers (0-11, with 0 = C) to note names, spelled with sharps
num2pitch = dict(enumerate(
    ('C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B')
))

## Data

In [None]:
# Download the dataset (the clean_midi archive)
!wget 'http://hog.ee.columbia.edu/craffel/lmd/clean_midi.tar.gz'
# Unpack the archive; the cells below read from a clean_midi/ directory,
# presumably the one created by this step
!tar -xvf 'clean_midi.tar.gz'
# Delete the archive once it has been extracted
!rm 'clean_midi.tar.gz'

In [None]:
# Names of the immediate sub-directories of clean_midi/.
# os.walk() is a lazy generator whose first item describes the top-level
# directory, so next() reads just that level instead of walking (and holding
# in memory) the entire dataset tree only to use its first entry.
subdirs = next(os.walk('clean_midi'))[1]

In [None]:
# Load the list of Soul artists to filter the sub-directories of the downloaded dataset
soul = pd.read_csv('soul_artists.csv')
# NOTE(review): the column label 'Aaliyah' looks like an artist name, i.e. the
# CSV apparently has no header row and read_csv() consumed its first artist as
# the header. That name is therefore missing from the column values and is
# added back here. Confirm against soul_artists.csv.
soul = ['Aaliyah', *soul['Aaliyah'].to_list()]
# Keep only ASCII letters and spaces so punctuation and digits in the names
# do not influence the fuzzy matching
regex = re.compile('[^a-zA-Z ]')
soul = [regex.sub('', name) for name in soul]

In [None]:
len(subdirs) # Number of sub-directories found directly under clean_midi (all of them, before filtering)

2198

In [None]:
# Use fuzzy matching to map each Soul artist name to its closest dataset sub-directory
MATCH_THRESHOLD = 85  # arbitrary cut-off; a match must score strictly above it

matched_dirs = set()  # a set, because several artists can map to the same directory
for artist in soul:
  best_match = process.extractOne(artist, subdirs)
  # extractOne() returns None when there are no choices to match against
  if best_match is not None and best_match[1] > MATCH_THRESHOLD:
    matched_dirs.add(best_match[0])

# Sort so the directory order is identical on every run; plain set iteration
# order for strings can differ between interpreter sessions
soul_dirs = sorted(matched_dirs)
len(soul_dirs) # only Soul sub-directories

242

In [None]:
# Collect the path of every entry found inside each matched Soul directory
soul_file_list = []
for artist_dir in soul_dirs:
  folder = f"clean_midi/{artist_dir}"
  soul_file_list.extend(f"{folder}/{entry}" for entry in os.listdir(folder))

len(soul_file_list) # Number of MIDI files in the Soul genre

2515

## Chord Detection

In [None]:
def _chord_label(chord):
  """Build the text label of a complete chord: root, quality and optional slash bass."""
  label = num2pitch[chord.root_pc]
  if chord.quality != 'M':  # following the convention, a major triad is written as the bare root
    label += chord.quality
  if chord.bass_pc != chord.root_pc:  # the bass note is not the root note: slash chord
    label += "/" + num2pitch[chord.bass_pc]
  return label


def chordify_midi(path_infile, num):
  """Detect the chords of one MIDI file and write them to a text file.

  The file is parsed with miditoolkit and passed to Dechorder.dechord(). Every
  chord for which is_complete() is true is turned into a label, consecutive
  repeats of the same label are collapsed, and the remaining labels are
  written one per line to Chorded_MIDIs/chords<num>.txt.

  Args:
    path_infile: Path of the MIDI file to analyse.
    num: Identifier inserted into the output file name.

  Returns:
    None. The result is the text file; writing one file per input lets the
    function be run from a worker pool without sharing state.
  """
  from itertools import groupby

  # load
  midi_obj = miditoolkit.midi.parser.MidiFile(path_infile)

  # extract chords, skipping the incomplete ones
  chord_names = [
      _chord_label(chord)
      for chord in Dechorder.dechord(midi_obj)
      if chord.is_complete()
  ]

  # dedup: groupby() groups consecutive equal items, so taking each group's key
  # collapses every run of identical labels into a single entry
  dedup_chords = [name for name, _ in groupby(chord_names)]

  # the context manager closes the file even if a write fails
  with open(f"Chorded_MIDIs/chords{num}.txt", "w") as textfile:
    textfile.writelines(f"{name}\n" for name in dedup_chords)

In [None]:
# Run the function in a pool of worker processes to speed things up
# Each entry is the argument list of one chordify_midi() call: [path, file number]
data = [[path, str(i)] for i, path in enumerate(soul_file_list)]

# starmap() blocks until every file has been processed; the context manager
# then shuts the worker processes down instead of leaving the pool open
with mp.Pool() as pool:
  pool.starmap(chordify_midi, data)

In [None]:
# Combine the separate text files into one big file
with open('soul_chords.txt', 'wb') as wfd:
  # os.listdir() returns names in arbitrary order; sorting keeps the combined
  # file reproducible from run to run
  for fname in sorted(os.listdir('Chorded_MIDIs')):
    # suffix test: a substring test would also accept names like 'x.txt.bak'
    if fname.endswith('.txt'):
      with open('Chorded_MIDIs/' + fname, 'rb') as fd:
        shutil.copyfileobj(fd, wfd)

In [None]:
import csv
# Combining it all for Pickle export: one entry per chord file, each entry
# being the list of rows csv.reader() yields for that file
all_chords = []
# sorted(): os.listdir() order is arbitrary; sorting keeps the result reproducible
for fname in sorted(os.listdir('Chorded_MIDIs')):
  if not fname.endswith('.txt'):
    continue
  # `with` closes every file after it is read, so handles are not left open
  # across the loop; newline='' is how the csv module expects files to be opened
  with open('Chorded_MIDIs/' + fname, 'r', newline='') as chord_file:
    all_chords.append(list(csv.reader(chord_file)))

In [None]:
# Serialise the collected chord rows to soul_chords.pickle
with open('soul_chords.pickle', 'wb') as pickle_out:
  pickle.dump(all_chords, pickle_out)