In [1]:
from dataclasses import dataclass

import numpy as np
import pandas as pd

# Helper class
@dataclass
class Note:
    """A single entry of a voice: a pitch (or a rest) together with its length.

    Attributes:
        note: The pitch.
        octave: The (Midi) octave.
        rest: Whether the note is a rest.
        duration: The duration of the note (in 1/16 second intervals).
    """

    note: int = 0
    octave: int = 0
    rest: int = 0
    duration: int = 0

# Dataframe options: VOICE selects the column (voice) to extract, LAG is the number of lagged periods to add.
VOICE = 3
LAG = 8

# Load the tab-separated, header-less input file
piano_input = pd.read_csv('F.txt', header=None, sep='\t')

# Convert piano key numbers (A0 = 1) into Midi pitches (A0 = 21) by adding 20.
# Sources disagree on the Midi number of A0 (9 or 21); 9 generally seems to be A(-1).
# Mapping octave -1 to index 0 (probably the easiest choice) requires A0 = 21.
# Entries equal to 0 are left untouched; every non-zero entry is shifted.
midi_input = piano_input.mask(piano_input != 0, piano_input + 20)
# Select the column of the voice being processed
voice_input = midi_input.iloc[:, VOICE]

# Collapse the voice into notes: each run of consecutive identical entries
# becomes one Note whose duration is the number of entries in the run.
notes = []
cur = None  # The run currently being extended (None until the first entry is seen)

for entry in voice_input:
    note = entry % 12
    octv = entry // 12
    if cur is not None and cur.note == note and cur.octave == octv:
        # Same pitch as the previous entry: the current note just lasts longer
        cur.duration += 1
    else:
        # Pitch changed: store the finished note (if any) and start a new one.
        # An entry of 0 (note 0 in octave 0) marks a rest.
        if cur is not None:
            notes.append(cur)
        cur = Note(note, octv, 1 if entry == 0 else 0, 1)

# Store the final run as well; the loop only appends a note when the *next*
# one starts, so without this the last note of the voice would be lost.
if cur is not None:
    notes.append(cur)

# Pandas can automatically convert a list of dataclass objects to dataframes!
notes_df = pd.DataFrame(notes)

# Create a lagged input dataframe: for every lag 1..LAG, take a copy of notes_df
# shifted down by that many rows and tag its columns with a `_lag<k>` suffix.
lagged_frames = [
    notes_df.shift(lag).add_suffix(f'_lag{lag}')
    for lag in range(1, LAG + 1)
]

# Join the current notes with all lagged copies in a single concat (instead of
# re-concatenating the growing frame once per lag), then drop the rows that
# contain NA, i.e. basically the first LAG rows, which have no full history.
# The shift introduces NaN, which is why the lag columns are floats.
notes_lagged_df = pd.concat([notes_df, *lagged_frames], axis=1).dropna()

# Show first 10 notes
notes_lagged_df.head(10)

Unnamed: 0,note,octave,rest,duration,note_lag1,octave_lag1,rest_lag1,duration_lag1,note_lag2,octave_lag2,...,rest_lag6,duration_lag6,note_lag7,octave_lag7,rest_lag7,duration_lag7,note_lag8,octave_lag8,rest_lag8,duration_lag8
8,5,5,0,12,2.0,5.0,0.0,8.0,9.0,5.0,...,0.0,12.0,2.0,5.0,0.0,8.0,0.0,0.0,1.0,8.0
9,11,4,0,4,5.0,5.0,0.0,12.0,2.0,5.0,...,0.0,4.0,9.0,5.0,0.0,12.0,2.0,5.0,0.0,8.0
10,4,5,0,8,11.0,4.0,0.0,4.0,5.0,5.0,...,0.0,16.0,7.0,5.0,0.0,4.0,9.0,5.0,0.0,12.0
11,9,4,0,4,4.0,5.0,0.0,8.0,11.0,4.0,...,0.0,16.0,5.0,5.0,0.0,16.0,7.0,5.0,0.0,4.0
12,4,5,0,4,9.0,4.0,0.0,4.0,4.0,5.0,...,0.0,16.0,7.0,5.0,0.0,16.0,5.0,5.0,0.0,16.0
13,9,5,0,4,4.0,5.0,0.0,4.0,9.0,4.0,...,0.0,8.0,9.0,5.0,0.0,16.0,7.0,5.0,0.0,16.0
14,7,5,0,4,9.0,5.0,0.0,4.0,4.0,5.0,...,0.0,12.0,2.0,5.0,0.0,8.0,9.0,5.0,0.0,16.0
15,5,5,0,4,7.0,5.0,0.0,4.0,9.0,5.0,...,0.0,4.0,5.0,5.0,0.0,12.0,2.0,5.0,0.0,8.0
16,2,5,0,4,5.0,5.0,0.0,4.0,7.0,5.0,...,0.0,8.0,11.0,4.0,0.0,4.0,5.0,5.0,0.0,12.0
17,10,5,0,4,2.0,5.0,0.0,4.0,5.0,5.0,...,0.0,4.0,4.0,5.0,0.0,8.0,11.0,4.0,0.0,4.0
