# Voice leading reinforcement learning agents.

## Introduction.
[...]

$$
\mathbb{E}[\text{return}|\alpha, s]
\ =\ 
\text{reward}+\underset{\ \beta\ \in\ \mathcal{A}_{\alpha(s)}\!\!}{\text{max}}\mathbb{E}\left[\text{return}|\beta, \alpha(s)\right]
$$

$$
v(\alpha, s)
\ =\ 
R(\alpha)+\underset{\ \beta\ \in\ \mathcal{A}_{\alpha(s)}\!\!}{\text{max}}v\big(\beta, \alpha(s)\big)
$$

In [1]:
import copy

import random
import math
import numpy as np
from inspect import isfunction

import torch
import torch.nn as nn

from tqdm import tqdm

___

## Classes for various aspects of music theory.
The various Python classes we define in this section collect important aspects of music theory relevant to problem of voice leading. Using MIDI standard encoding for instance, every note in the scale can be assigned an integer value between $0$ and $127$. In this way, a solution to any voice leading problem can be encoded completely numerically. However, the reward functions for the sequence of step-by-step actions that constitute a proposed solution to a voice leading problem depend on musical theoretical considerations. We will use the classes we define in the present section in order to evaluation remards for our agent's actions.

### Classes related to harmony and melody.

#### Class: `Notes`
Parent(s): *none*

Constructor arguments: *none*

In [2]:
class Notes():
    def __init__(self):
        
        self.all_note_class_names = ['C',
                                     'C♯','D♭',
                                     'D',
                                     'D♯','E♭',
                                     'E',
                                     'F',
                                     'F♯','G♭',
                                     'G',
                                     'G♯','A♭',
                                     'A',
                                     'A♯','B♭',
                                     'B']
        
        note_values = [value for value in range(57, 82)]
        self.note_values = note_values
    
        degree_notes = {0: ('C', 'C'),
                        1: ('C♯', 'D♭'),
                        2: ('D', 'D'),
                        3: ('D♯', 'E♭'),
                        4: ('E', 'E'),
                        5: ('F', 'F'),
                        6: ('F♯', 'G♭'),
                        7: ('G', 'G'),
                        8: ('G♯', 'A♭'),
                        9: ('A', 'A'),
                        10: ('A♯', 'B♭'),
                        11: ('B', 'B')}
        self.degree_notes = degree_notes
        
        class_name_to_degree = {'C': 0,
                                'C♯': 1,
                                'D♭': 1,
                                'D':2,
                                'D♯':3,
                                'E♭':3,
                                'E':4,
                                'F':5,
                                'F♯':6,
                                'G♭':6,
                                'G':7,
                                'G♯':8,
                                'A♭':8,
                                'A':9,
                                'A♯':10,
                                'B♭':11,
                                'B':12}
        self.class_name_to_degree = class_name_to_degree
        
        values_to_notes = {}
        for value in self.note_values:
            degree = value%12
            octave = int((value - degree)/12)-1
            sharp_name = degree_notes[degree][0]
            flat_name = degree_notes[degree][1]
            values_to_notes.update({value: (sharp_name + str(octave),
                                            flat_name + str(octave))})
        self.values_to_notes = values_to_notes
        
        sharps_to_values = {}
        flats_to_values = {}
        for key, value in self.values_to_notes.items():
            note_value = key
            sharp_note = value[0]
            flat_note = value[1]
            sharps_to_values.update({sharp_note: copy.deepcopy(note_value)})
            flats_to_values.update({flat_note: copy.deepcopy(note_value)})
        self.sharps_to_values = sharps_to_values
        self.flats_to_values = flats_to_values
        
        all_note_names = []
        for key in self.sharps_to_values:
            if key in all_note_names:
                pass
            else:
                all_note_names.append(key)
        for key in self.flats_to_values:
            if key in all_note_names:
                pass
            else:
                all_note_names.append(key)
        self.all_note_names = all_note_names
        
        notes_to_values = {}
        for name in self.all_note_names:
            if name in self.sharps_to_values:
                note_value = self.sharps_to_values[name]
                notes_to_values.update({name: copy.deepcopy(note_value)})
            else:
                note_value = self.flats_to_values[name]
                notes_to_values.update({name: copy.deepcopy(note_value)})
        self.notes_to_values = notes_to_values

Testing:

In [3]:
notes = Notes()
notes.values_to_notes

{57: ('A3', 'A3'),
 58: ('A♯3', 'B♭3'),
 59: ('B3', 'B3'),
 60: ('C4', 'C4'),
 61: ('C♯4', 'D♭4'),
 62: ('D4', 'D4'),
 63: ('D♯4', 'E♭4'),
 64: ('E4', 'E4'),
 65: ('F4', 'F4'),
 66: ('F♯4', 'G♭4'),
 67: ('G4', 'G4'),
 68: ('G♯4', 'A♭4'),
 69: ('A4', 'A4'),
 70: ('A♯4', 'B♭4'),
 71: ('B4', 'B4'),
 72: ('C5', 'C5'),
 73: ('C♯5', 'D♭5'),
 74: ('D5', 'D5'),
 75: ('D♯5', 'E♭5'),
 76: ('E5', 'E5'),
 77: ('F5', 'F5'),
 78: ('F♯5', 'G♭5'),
 79: ('G5', 'G5'),
 80: ('G♯5', 'A♭5'),
 81: ('A5', 'A5')}

#### Class: `Scales`
Parent(s): `Notes`

Constructor arguments: *none*

In [4]:
class Scales(Notes):
    def __init__(self):
        super().__init__()
        
        # Construct modern mode degrees, ascending and descending, as attributes:
        self.long_step_sequence = [2, 2, 1, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2]
        
        self.mode_start = {'Ionian': 0,
            'Dorian': 1,
            'Phrygian': 2,
            'Lydian': 3,
            'Mixolydian': 4,
            'Aeolian': 5,
            'Locrian': 6}
        
        modern_mode_steps = {}
        for key, value in self.mode_start.items():
            mode = key
            start_position = value
            current_mode_steps = [self.long_step_sequence[i] for i in range(start_position, start_position+7)]
            modern_mode_steps.update({mode: current_mode_steps})
        self.modern_mode_steps = modern_mode_steps
        
        updown_mode_degrees = {}
        for key, value in self.modern_mode_steps.items():
            mode = key
            step_sequence = value
            degree_sequence = [0]
            for i, step in enumerate(step_sequence):
                scale_degree = degree_sequence[i]
                new_scale_degree = (scale_degree + step)%12
                degree_sequence.append(new_scale_degree)
                rev_degree_sequence = degree_sequence[::-1]
            updown_mode_degrees.update({mode: {'up': copy.deepcopy(degree_sequence),
                                     'down': copy.deepcopy(rev_degree_sequence)}})
        
        # Construct Major mode degrees, ascending and descending, as attributes:
        major_updown = updown_mode_degrees['Ionian']
        updown_mode_degrees.update({'Major': copy.deepcopy(major_updown)})

        # Construct Natural minor mode degrees, ascending and descending, as attributes:
        natural_minor_updown = updown_mode_degrees['Aeolian']
        updown_mode_degrees.update({'Natural_minor': copy.deepcopy(natural_minor_updown)})

        # Construct Harmonic minor mode degrees, ascending and descending, as attributes:
        harmonic_minor_steps = [2, 1, 2, 2, 1, 3, 1]
        harmonic_minor_degree_sequence = [0]
        for i, step in enumerate(harmonic_minor_steps):
            scale_degree = harmonic_minor_degree_sequence[i]
            new_scale_degree = (scale_degree + step)%12
            harmonic_minor_degree_sequence.append(new_scale_degree)
            rev_harmonic_minor_degree_sequence = harmonic_minor_degree_sequence[::-1]
        updown_mode_degrees.update({'Harmonic_minor': {'up': copy.deepcopy(harmonic_minor_degree_sequence),
                                                     'down': copy.deepcopy(rev_harmonic_minor_degree_sequence)}})

        # Construct Melodic minor mode degrees, ascending and descending, as attributes:
        melodic_minor_steps_up = [2, 2, 1, 2, 2, 2, 1]
        melodic_minor_degrees_up = [0]
        for i in range(7):
            current_degree = melodic_minor_degrees_up[i]
            next_degree = (current_degree + melodic_minor_steps_up[i])%12
            melodic_minor_degrees_up.append(next_degree)
        melodic_minor_steps_down = [2, 2, 1, 2, 1, 2, 2]
        melodic_minor_degrees_down = [0]
        for i in range(7):
            current_degree = melodic_minor_degrees_down[i]
            next_degree = (current_degree - melodic_minor_steps_down[i])%12
            melodic_minor_degrees_down.append(next_degree)
        updown_mode_degrees.update({'Melodic_minor': {'up': copy.deepcopy(melodic_minor_degrees_up),
                                                    'down': copy.deepcopy(melodic_minor_degrees_down)}})

        # Combine all ascending and descending mode degrees into attribute dictionary:
        self.updown_mode_degrees = updown_mode_degrees

        # Collect all modes constructed as list attribute:
        mode_list = [key for key in self.updown_mode_degrees]
        self.mode_list = mode_list
 
    # Method for querying the ascending/descending mode degree dictionary attribute:
    def updown_degrees(self, mode):
        assert mode in self.mode_list
        output = self.updown_mode_degrees[mode]
        return output
        

Testing:

In [5]:
scales = Scales()

#### Class: `Key`
Parent(s): `Scales` ≺ `Notes`

Constructor arguments:
* *root* = `'C'`, 
* *mode* = `'Major'`

In [6]:
class Key(Scales):
    def __init__(self,
                 root = 'C',
                 mode = 'Major'):
        super().__init__()
        super().updown_degrees(mode)
        
        assert root in self.all_note_class_names
        assert mode in self.mode_list
        
        self.root_class_name = root
        self.root_class_degree = self.class_name_to_degree[self.root_class_name]
        self.mode = mode
        
    # Method for querying the ascending/descending mode degree dictionary attribute from the parent `Scales` class:
    def scale_degrees(self):
        output = self.updown_degrees(mode = self.mode)
        return output
    
    # Method for outputing the (upward) triad, in degree classes, for our Key:
    def scale_triad(self):
        up_degrees = self.scale_degrees()['up']
        output = [up_degrees[i] for i in [0,2,4]]
        return output

Testing:

In [7]:
key = Key(root = 'E', mode = 'Melodic_minor')
key.scale_triad()

[0, 4, 7]

### Classes related to temporal aspects of music theory, such as time signature and rhythm.

#### Class: `TimeSignature`
Parent(s): *none*

Constructor arguments:
* *numerator* = `4`,
* *denominator* = `4`

In [8]:
class TimeSignature():
    def __init__(self,
                 numerator = 4,
                 denominator = 4):
        
        assert isinstance(numerator, int) and isinstance(denominator, int)
        assert (numerator > 0) and (denominator > 0)
        
        self.counts_per_measure = numerator
        self.count_duration = 1/denominator
        self.measure_duration = self.counts_per_measure * self.count_duration

Testing:

In [9]:
time_signature = TimeSignature()

#### Class: `AccentPattern`
Parent(s): `TimeSignature`

Constructor arguments:
* *numerator* = `4`,
* denominator = `4`,
* *accent_pattern* = `[[2,1,1,1],[2,1,1,2]]`

In [10]:
class AccentPattern(TimeSignature):
    def __init__(self,
                 numerator = 4,
                 denominator = 4,
                 accent_pattern = [[2,1,1,1],[2,1,1,2]]):
        
        super().__init__(numerator = numerator,
                         denominator = denominator)
        
        assert isinstance(accent_pattern, list)
        for bar in accent_pattern:
            assert isinstance(bar, list)
            assert len(bar) == numerator
            for entry in bar:
                assert isinstance(entry, int)
                assert 1 <= entry <= 2
                
        self.accent_pattern = accent_pattern
        self.bar_count = len(self.accent_pattern)
        self.beats_per_measure = numerator
        self.total_beat_count = self.bar_count * self.beats_per_measure
        
        for i, bar in enumerate(self.accent_pattern):
            attribute_name = 'bar_{}_pattern'.format(i)
            setattr(AccentPattern, attribute_name, bar)

Testing:

In [11]:
accent_pattern = AccentPattern()

**Note:** Other classes of this sort will introduce the idea of rests in the voice leading texture, and other note patterns for more complex species.

___

## Classes for rewards.

### Classes for vertical, i.e., harmony-based rewards.

#### Class: `ConsonanceScheme`
Parent(s): *none*

Constructor arguments:
* *transform* = `standard_transform`, where *standard_transform* = `lambda x: np.pi + np.log(x)`

In [12]:
class ConsonanceScheme():
    standard_transform = lambda x: np.pi + np.log(x)
    
    def __init__(self,
                 transform = standard_transform):
        
        assert isfunction(transform)
        
        interval_to_name = {0: 'unison',
                            1: 'minor_second',
                            2: 'major_second',
                            3: 'minor_third',
                            4: 'major_third',
                            5: 'perfect_fourth',
                            6: 'tritone',
                            7: 'perfect_fifth',
                            8: 'minor_sixth',
                            9: 'major_sixth',
                            10: 'minor_seventh',
                            11: 'major_seventh'}
        self.interval_to_name = interval_to_name
        
        interval_to_height = {0: 1,
                               1: 1/15,
                               2: 1/9,
                               3: 1/15,
                               4: 1/5,
                               5: 1/3,
                               6: 1/35,
                               7: 1/3,
                               8: 1/5,
                               9: 1/15,
                               10: 1/9,
                               11: 1/15}
        self.interval_to_height = interval_to_height
        
        scale_degrees = [i for i in range(12)]
        intervals = []
        for i in scale_degrees:
            for j in scale_degrees:
                interval_class = [i,j]
                interval_class.sort()
                if interval_class not in intervals:
                    intervals.append(interval_class)
        
        self.intervals = intervals
        
        interval_rewards = {}
        transformed_interval_rewards = {}
        for key in self.intervals:
            interval = key[1]-key[0]
            height = interval_to_height[interval]
            interval_rewards.update({(key[0], key[1]): copy.deepcopy(height)})
            transformed_interval_rewards.update({(key[0], key[1]): transform(height)})
            transformed_interval_rewards.update({(key[1], key[0]): transform(height)})
        self.interval_rewards = interval_rewards
        self.transformed_interval_rewards = transformed_interval_rewards
        
        
    def consonance_reward(self, interval_pair_class):
        # IMPORTANT: The argument interval_pair_class here is
        # the class (modulo 12) of a numerical interval.
        
        output = self.transformed_interval_rewards[interval_pair_class]
        
        return output

Testing:

In [13]:
consonance_scheme = ConsonanceScheme()
consonance_scheme.consonance_reward((1,5))

1.5321547411556928

### Classes for horizontal, i.e., melody-based rewards.

#### Class: `ZeroStepScheme`
Parent(s): `Key` ≺ `Scales` ≺ `Notes`

Constructor arguments:
* *root* = `'C'`, 
* *mode* = `'Major'`

In [14]:
class ZeroStepScheme(Key):
    def __init__(self,
                 root = 'C',
                 mode = 'Major'):
        Key.__init__(self,
                     root = root,
                     mode = mode)
        
        self.up_or_down = {1: 'up', 0: 'up', -1: 'down'}
    
    def scale_degree_reward(self,
               interval_pair_0,
               interval_pair_1):
        # IMPORTANT: The argument interval_pair_class here is a numerical interval,
        # NOT the corresponding class.
        
        root_degree = self.root_class_degree
        relative_degree_0 = (interval_pair_1[0]-root_degree)%12
        relative_degree_1 = (interval_pair_1[1]-root_degree)%12
        
        step_0 = interval_pair_1[0] - interval_pair_0[0]
        step_1 = interval_pair_1[1] - interval_pair_0[1]
        
        sign_0 = np.sign(step_0)
        sign_1 = np.sign(step_1)
        
        direction_0 = self.up_or_down[sign_0]
        direction_1 = self.up_or_down[sign_1]
        
        scale_degrees_0 = self.scale_degrees()[direction_0]
        scale_degrees_1 = self.scale_degrees()[direction_1]
        
        in_scale_query_0 = int(relative_degree_0 in scale_degrees_0)
        in_scale_query_1 = int(relative_degree_1 in scale_degrees_1)
        
        output = in_scale_query_0 + in_scale_query_1
        
        return output
    
    # Need to fix to give rewards for all scale degrees on accents,
    # with special attention to tonic on accents...
    def tonic_accents_reward(self,
                             interval_pair,
                             bar_number = 0,
                             beat_number = 0,
                             accent_pattern = AccentPattern()):
        # IMPORTANT: The argument interval_pair_class here is a numerical interval,
        # NOT the corresponding class.
        
        accent = accent_pattern.accent_pattern[bar_number][beat_number]
        
        reward = 0
        for value in interval_pair:
            root_degree = self.root_class_degree
            tonic_query = int(0 == (value - root_degree)%12)
            reward += tonic_query * accent
            
        return reward

    
    def extrema_triad_reward(self,
                             interval_sequence = [(73, 84), (67, 81), (71, 77), (75, 79)]):
        # IMPORTANT: The argument interval_pair_class here is a numerical interval,
        # NOT the corresponding class.
        
        lower_line = [pair[0] for pair in interval_sequence]
        upper_line = [pair[1] for pair in interval_sequence]
        
        root_degree = self.root_class_degree
        
        lower_min_degree_class = (min(lower_line) - root_degree)%12
        lower_max_degree_class = (max(lower_line) - root_degree)%12
        upper_min_degree_class = (min(upper_line) - root_degree)%12
        upper_max_degree_class = (max(upper_line) - root_degree)%12
        
        triad = self.scale_triad()
        extrema_degrees = [lower_min_degree_class,
                           lower_max_degree_class,
                           upper_min_degree_class,
                           upper_max_degree_class]
        
        output = 0
        for degree in extrema_degrees:
            if degree in triad:
                output += 1
        
        return output

Testing:

In [15]:
zero_step_scheme = ZeroStepScheme()
print(zero_step_scheme.extrema_triad_reward() == 2)
print(zero_step_scheme.root_class_name)

True
C


#### Class: `StepScheme`
Parents(s): `ConsonanceScheme`

Constructor arguments:
* *consonance_transform* = `identity_transform`, where *identity_transform* = `lambda x: x`
* *step_weight* = `strandard_step_weight`, where *strandard_step_weight* = `lambda x: (12-x)/12`

In [16]:
class StepScheme(ConsonanceScheme):
    identity_transform = lambda x: x
    strandard_step_weight = lambda x: np.sign(x) * (4/(1 + x**2))
    
    def __init__(self,
                 consonance_transform = identity_transform,
                 step_weight = strandard_step_weight):
        super().__init__(consonance_transform)
        
        assert isfunction(step_weight)
        
        self.step_weight = step_weight
        
        
    def step_reward(self,
               interval_pair_0,
               interval_pair_1):
        # IMPORTANT: The argument interval_pair_class here is a numerical interval,
        # NOT the corresponding class.
        
        step_0 = interval_pair_1[0] - interval_pair_0[0]
        step_1 = interval_pair_1[1] - interval_pair_0[1]
        
        abs_step_0 = abs(step_0)
        abs_step_1 = abs(step_1)
        
        weight_0 = self.step_weight(abs_step_0)
        weight_1 = self.step_weight(abs_step_1)
        
        consonance_reward_0 = self.consonance_reward((0, abs_step_0%12))
        consonance_reward_1 = self.consonance_reward((0, abs_step_1%12))
        
        consonancescaled_weight_0 = weight_0 * consonance_reward_0
        consonancescaled_weight_1 = weight_1 * consonance_reward_1
        
        total_weight = consonancescaled_weight_0 + consonancescaled_weight_1
        
        return total_weight

Testing:

In [17]:
step_scheme = StepScheme()

#### Class: `TripleStepScheme`
Parents(s): `ConsonanceScheme`

Constructor arguments:

In [18]:
class TripleStepScheme(ConsonanceScheme):
    identity_transform = lambda x: x
    strandard_step_weight = lambda x: (12-x)/12
    
    def __init__(self,
                 consonance_transform = identity_transform,
                 step_weight = strandard_step_weight):
        super().__init__(consonance_transform)
        
    
    def within_octave_reward(self,
                             interval_sequence = [(73, 76), (64, 84), (51, 92)]):
        
        lower_line = [pair[0] for pair in interval_sequence]
        upper_line = [pair[1] for pair in interval_sequence]        
        
        lower_min_degree_class = min(lower_line)
        lower_max_degree_class = max(lower_line)
        upper_min_degree_class = min(upper_line)
        upper_max_degree_class = max(upper_line)
        
        lower_range = max(lower_line) - min(lower_line)
        upper_range = max(upper_line) - min(upper_line)
        
        output = -int(lower_range > 12) - int(upper_range > 12)
        
        return output

Testing:

In [19]:
triple_step_scheme = TripleStepScheme()
triple_step_scheme.within_octave_reward() == -2

True

### Classes for progress-to-final-interval rewards.

#### Class: `ProgtoFinScheme`
Parent(s): *none*

Constructor arguments:

In [20]:
class ProgtoFinScheme():
    def __init__(self):
        pass
        
    def prog_to_fin_reward(self,
                                 interval_0 = (73, 76),
                                 interval_1 = (72, 74),
                                 final_interval = (35, 45)):
        
        centroid_0 = (interval_0[0] + interval_0[1])/2
        centroid_1 = (interval_1[0] + interval_1[1])/2
        final_centroid = (final_interval[0] + final_interval[1])/2
        
        needed_change = final_centroid - centroid_0
        needed_direction = np.sign(needed_change)
        
        centroid_change = centroid_1 - centroid_0
        actual_direction = np.sign(centroid_change)
        signed_change = actual_direction * needed_direction
        
        scale_signed_change = 5 * signed_change
        
        return scale_signed_change

Testing:

In [21]:
prog_to_fin_scheme = ProgtoFinScheme()
print(prog_to_fin_scheme.prog_to_fin_reward() > 0.)
print(prog_to_fin_scheme.prog_to_fin_reward(interval_1 = (74, 77)) < 0.)

True
True


### Classes for rewards that mix horizontal & vertical aspects.

#### Class: `NonCrossScheme`
Parent(s): *none*

Constructor arguments:

In [22]:
class NonCrossScheme():
    def __init__(self):
        pass
        
    def non_cross_reward(self,
                         interval = (76, 73)):
        
        if interval[0] >= interval[1]:
            crossing_penalty = -1 - (interval[0] - interval[1])
        
        return crossing_penalty

Testing:

In [23]:
non_cross_scheme = NonCrossScheme()
non_cross_scheme.non_cross_reward()

-4

#### Class: `NonOverlapScheme`
Parent(s): *none*

Constructor arguments:

**Remark.** See the definition [here](https://en.wikipedia.org/wiki/Voice_crossing#Voice_overlapping). Similar to voice crossing, but more like "immediate displacement."

Testing:

#### Class: `NonParallelScheme`
Parent(s): `ConsonanceScheme`

Constructor arguments:

Constructor arguments:
* *transform* = `standard_transform`, where *standard_transform* = `lambda x: np.pi + np.log(x)`

In [24]:
class NonParallelScheme(ConsonanceScheme):
    standard_transform = lambda x: np.pi + np.log(x)
    
    def __init__(self,
                 transform = standard_transform):
        super().__init__(transform = transform)
        
#    def consonance_reward(self, pair):
#        super().consonance_reward(pair)
        
    def non_cross_reward(self,
                         interval_0 = (58, 61),
                         interval_1 = (63, 68)):
        
        lower_step = interval_1[0] - interval_0[0]
        upper_step = interval_1[1] - interval_0[1]
        
        if lower_step == upper_step:
            consonance_penalty = -self.consonance_reward((0, lower_step%12))
        else:
            consonance_penalty = 0
        
        return consonance_penalty

Testing:

In [25]:
non_parallel_scheme = NonParallelScheme()
non_parallel_scheme.non_cross_reward()

0

___

## Classes for voice leading states & actions.

### Classes for "first species" states & actions.

#### Class: `Interval`
Parent(s): *none*

Constructor arguments:
* *key* = `default_key`, where *default_key* = `Key(root = 'C', mode = 'Major')`,
* *note_pair* = `('G5', 'C6')`

In [26]:
class Interval():
    default_key = Key(root = 'C', mode = 'Major')
    
    def __init__(self,
                 key = default_key,
                 note_pair = ('A3', 'C4')):
        
        assert isinstance(key, Key)
        assert isinstance(note_pair, tuple)
        assert len(note_pair) == 2
        for entry in note_pair:
            assert isinstance(entry, str)
            assert entry in key.all_note_names
            
        self.key = key
        self.note_pair = note_pair
        
        note_values = [key.notes_to_values[name] for name in self.note_pair]
        self.note_values = tuple(note_values)
        self.note_pair_class = [(value-key.root_class_degree)%12 for value in self.note_values]

Testing:

In [27]:
interval = Interval()
print(interval.note_values)

(57, 60)


___

## Agent classes corresponding to different voice leading species.

### "First species" action-value neural network.

#### Function: `randinterval`
Arguments: *none*

In [28]:
def randinterval():
    lower_note_value = random.randrange(57, 81)
    note_difference = random.randrange(1, 11)
    while lower_note_value + note_difference > 81:
        note_difference = random.randrange(1, 11)
    
    output_interval = (lower_note_value, lower_note_value + note_difference)
    
    return output_interval

Testing:

In [29]:
randinterval()

(79, 81)

#### Class: `ActionValue_Spec1`
Parent(s): `torch.nn.Module`

Constructor arguments:
* *layer_count* = `6`
* *layer_features* = `1000`

**Remark: `ReLU` versus `Softmax`.** Because we're implicitly using the *greedy policy*, which, at each state $s$, always selects the action $\alpha$ that maximizes the action-value $v_{\text{greed}}(s,\alpha)$, it might appear that the neural network that approximates $v_{\text{greed}}(s,\alpha)$ should use *softmax* activation at its final layer. However, the specific value of $v_{\text{greed}}(s,\alpha)$ is also important. This activation function $v_{\text{greed}}(s,\alpha)$ is supposed to output the *excpected return* $\mathbb{E}_{\text{greed}}[G|\alpha,\pi]$, which is a (potentially weighted) sum of all future rewards that the agent will obtain under the greedy policy. Because we've already specified our rewards implicitly in the various reward functions we defined above, we will run into trouble if we use softmax. Indeed, $0\le \text{softmax}(x)\le 1$, whereas our reward functions can tske all sorts of integer values, sometimes negative. Thus is makes more sense to use `ReLU` or `LeakyReLU` for activation in our neural network.

In [54]:
class ActionValue_Spec1(nn.Module):
    def __init__(self, layer_count = 8, layer_features = 1000):
        super().__init__()
        
        self.layer_count = layer_count
        self.layer_features = layer_features
        
        self.layers = nn.ModuleList()
        
        self.layers.append(nn.Linear(in_features = 3 * 195, out_features = self.layer_features))
        for k in range(self.layer_count-2):
            self.layers.append(nn.Linear(in_features = self.layer_features, out_features = self.layer_features))
        self.layers.append(nn.Linear(in_features = self.layer_features, out_features = 1))
        
        self.activation = nn.LeakyReLU(negative_slope=0.01)
        
        possible_pair_indices = {}
        pair_counter = 0
        for i in range(57,81):
            for j in range(1,11):
                if i + j < 82:
                    possible_pair_indices.update({(i, i+j): copy.deepcopy(pair_counter)})
                    pair_counter += 1
        self.possible_pair_indices = possible_pair_indices
        print('\nNumber of pairs:', len(self.possible_pair_indices))
        
        indices_to_pairs = {}
        for pair in self.possible_pair_indices:
            index = self.possible_pair_indices[pair]
            indices_to_pairs.update({index: copy.deepcopy(pair)})
        self.indices_to_pairs = indices_to_pairs
        
    def forward(self, x):
        activated_features = x
        for layer in self.layers:
            output_features = layer(activated_features)
            activated_features = self.activation(output_features)
            
        return activated_features
    
    
    def max_arg(self, pair_0, final_pair):
        k = self.possible_pair_indices[pair_0]
        s = torch.tensor([float(i == k) for i in range(195)])
        
        n = self.possible_pair_indices[final_pair]
        f = torch.tensor([float(i == n) for i in range(195)])
        
        entry_0, entry_1 = pair_0[0], pair_0[1]

        possible_actions = []
        for lower_entry in range(57, 81):
            for upper_diff in range(1, 11):
                if lower_entry + upper_diff < 82:
                    possible_actions.append((lower_entry, lower_entry + upper_diff))

        max_val = -math.inf
        for pair in possible_actions:
            index = self.possible_pair_indices[pair]
            a = torch.tensor([float(j == index) for j in range(195)])
            model_input = torch.cat((s, a, f))
            model_output = self.forward(model_input)
            if model_output.item() > max_val:
                max_val = model_output.item()
                argument_that_maximizes = index
        return argument_that_maximizes
    
    
    def epsilon_greedy(self, epsilon, pair_0, final_pair):
        greedy_or_random = np.random.choice(['greedy', 'random'], p=[1-epsilon, epsilon])
        
        if greedy_or_random == 'greedy':
            state_index = self.max_arg(pair_0, final_pair)
        elif greedy_or_random == 'random':
            state_index = random.randrange(0, 195)
        
        return state_index 
    
    
    
    def double_max_arg(self, pair_0, final_pair):
        
        max_arg_1 = self.max_arg(pair_0, final_pair)
        pair_1 = self.indices_to_pairs[max_arg_1]
        
        max_arg_2 = self.max_arg(pair_1, final_pair)
        pair_2 = self.indices_to_pairs[max_arg_2]
        
        return pair_1, pair_2

Testing:

In [55]:
action_value_spec1 = ActionValue_Spec1()

pair_0 = randinterval()

final_pair = randinterval()

print('\nInitial state:', pair_0)

epsilon = 0.2

print('\nIndex of state after action with highest expected return:', action_value_spec1.epsilon_greedy(epsilon, pair_0, final_pair))

maximizing_argument = action_value_spec1.indices_to_pairs[action_value_spec1.epsilon_greedy(epsilon, pair_0, final_pair)]
print('State after action with highest expected return:', maximizing_argument)

k = action_value_spec1.possible_pair_indices[pair_0]
s = torch.tensor([float(i == k) for i in range(195)])
      
n = action_value_spec1.possible_pair_indices[final_pair]
f = torch.tensor([float(i == n) for i in range(195)])
        
index = action_value_spec1.possible_pair_indices[maximizing_argument]
a = torch.tensor([float(j == index) for j in range(195)])

model_input = torch.cat((s, a, f))
print('Value at this state:', action_value_spec1.forward(model_input).item(),'\n')

print('\n', action_value_spec1.double_max_arg(pair_0, final_pair))


Number of pairs: 195

Initial state: (58, 61)

Index of state after action with highest expected return: 184
State after action with highest expected return: (76, 81)
Value at this state: -4.5131360820960253e-05 


 ((76, 81), (61, 65))


### "First species" voice leading reinforcement learning agent.

#### Class: `Agent_FirstSpecies`
Parent(s): *none*

Constructor arguments:
* *key* = `default_key`, where *default_key* = `Key(root = 'C', mode = 'Major')`
* *start_interval* = `default_start_interval`, where *default_start_interval* = `['C♯7', 'E7']`
* *end_interval* = `default_end_interval`, where *default_end_interval* = `['C♯5', 'G5']`
* *expected_return* = `nn.Module()`

**Remark: Tip for next iteration.** I caused myself a bit of a headache by flip-flopping between inheritance and composition as I built all the classes above. In the nest iteration, I think it would make more sense to package all the underlying music theory classes by using them as constructors in one music theory "super class," and to package all the reward schemes together by using them as arguments in the constructor for a rewards super class. Then use these music theory and rewards super classes as arguments in the constructor for the learning agent.

In [56]:
class Agent_FirstSpecies():
    default_key = Key(root = 'C', mode = 'Major')
    default_start_interval = ('C♯5', 'E5')
    default_end_interval = ('C♯4', 'G4')
    
    default_accent_pattern = AccentPattern()
    
    default_actionvalue = ActionValue_Spec1()
    
    def __init__(self,
                 key = default_key,
                 start_interval = default_start_interval,
                 end_interval = default_end_interval,
                 expected_return = default_actionvalue,
                 accent_pattern = default_accent_pattern):
        
        super().__init__()
        
        assert isinstance(key, Key)
        assert isinstance(start_interval, tuple) and isinstance(end_interval, tuple)
        assert len(start_interval) == len(end_interval) == 2
        for entry in start_interval:
            assert entry in key.all_note_names
        for entry in end_interval:
            assert entry in key.all_note_names
            
        assert isinstance(expected_return, nn.Module)
        
        self.key = key
        self.start_interval = Interval(key = self.key, note_pair = start_interval)
        self.end_interval = Interval(key = self.key, note_pair = end_interval)
        
        self.end_interval_value = self.end_interval.note_values
        self.start_interval_value = self.start_interval.note_values
        
        self.expected_return = expected_return
        
        self.value_episode = [self.start_interval_value]
        
        possible_pair_indices = {}
        pair_counter = 0
        for i in range(57,81):
            for j in range(1,11):
                if i + j < 82:
                    possible_pair_indices.update({(i, i+j): copy.deepcopy(pair_counter)})
                    pair_counter += 1
        self.possible_pair_indices = possible_pair_indices
        
        indices_to_pairs = {}
        for pair in self.possible_pair_indices:
            index = self.possible_pair_indices[pair]
            indices_to_pairs.update({index: copy.deepcopy(pair)})
        self.indices_to_pairs = indices_to_pairs
        self.number_of_pairs = len(self.indices_to_pairs)
        
        final_index = self.possible_pair_indices[self.end_interval_value]
        end_vector = [float(i == final_index) for i in range(self.number_of_pairs)]
        end_tensor = torch.Tensor(end_vector)
        self.end_tensor = end_tensor
        
        tensor_episode = []
        pair_index = self.possible_pair_indices[self.start_interval.note_values]
        vector = [float(i == pair_index) for i in range(self.number_of_pairs)]
        tensor = torch.Tensor(vector)
        tensor_episode.append(copy.deepcopy(tensor))
        self.tensor_episode = tensor_episode
        
        self.accent_pattern = accent_pattern
        
        self.consonance_scheme = ConsonanceScheme()
        self.zero_step_scheme = ZeroStepScheme(self.key.root_class_name,
                                               self.key.mode)
        self.prog_to_fin_scheme = ProgtoFinScheme()
        self.step_scheme = StepScheme()
        self.triple_step_scheme = TripleStepScheme()
        
        
    def next_interval(self):
        last_pair = self.value_episode[-1]
        local_arg_max = copy.deepcopy(self.indices_to_pairs[self.expected_return.epsilon_greedy(epsilon, last_pair, self.end_interval_value)])
        
        self.value_episode.append(local_arg_max)
        
        new_index = self.possible_pair_indices[local_arg_max]
        current_vector = [float(i == new_index) for i in range(self.number_of_pairs)]
        current_tensor = torch.Tensor(copy.deepcopy(current_vector))
        self.tensor_episode.append(copy.deepcopy(current_tensor))
    

    def last_reward(self):
        
        running_reward = 0
        
        
        # ConsonanceScheme rewards:
        last_pair = self.value_episode[-1]
        last_pair_class = tuple([value%12 for value in last_pair])
        
        consonance_reward = self.consonance_scheme.consonance_reward(last_pair_class)
        running_reward += consonance_reward
        
        
        # ZeroStepScheme rewards:
        
        # Scale degree reward:
        if len(self.value_episode) >= 2:
            interval_A = self.value_episode[-2]
            interval_B = self.value_episode[-1]
            scale_degree_reward = self.zero_step_scheme.scale_degree_reward(interval_A, interval_B)
            
            running_reward += scale_degree_reward
        
        # tonic_accents_reward:
        beat_in_cycle_number = (len(self.value_episode) - 1)%(self.accent_pattern.total_beat_count)
        bar_number = int(np.floor(beat_in_cycle_number/self.accent_pattern.beats_per_measure))-1
        beat_in_measure_number = beat_in_cycle_number%(self.accent_pattern.beats_per_measure)
        
        accents_reward = self.zero_step_scheme.tonic_accents_reward(last_pair,
                                                                    bar_number = bar_number,
                                                                    beat_number = beat_in_measure_number,
                                                                    accent_pattern = self.accent_pattern)
        running_reward += accents_reward
        
        # extrema_triad_reward (one for cycle-wide extrema and one for measure-wide extrema):
        where_at = len(self.value_episode)
        measure_length = self.accent_pattern.beats_per_measure
        if where_at%measure_length == 0:
            last_complete_measure = self.value_episode[-(measure_length+1):-1]
            extrema_in_measure_reward = self.zero_step_scheme.extrema_triad_reward(interval_sequence = last_complete_measure)
            running_reward += extrema_in_measure_reward
        
        cycle_length = self.accent_pattern.total_beat_count
        if where_at%cycle_length == 0:
            last_complete_cycle = self.value_episode[-(cycle_length+1):-1]
            extrema_in_cycle_reward = self.zero_step_scheme.extrema_triad_reward(interval_sequence = last_complete_cycle)
            running_reward += extrema_in_cycle_reward
        
        
        # StepScheme rewards:
        if len(self.value_episode) >= 2:
            interval_A = self.value_episode[-2]
            interval_B = self.value_episode[-1]
            step_reward = self.step_scheme.step_reward(interval_A,
                                           interval_B)
            running_reward += step_reward
        
        
        # TripleStepScheme rewards:

        if len(self.value_episode) >= 3:
            interval_A = self.value_episode[-3]
            interval_B = self.value_episode[-2]
            interval_C = self.value_episode[-1]
            within_octave_reward = self.triple_step_scheme.within_octave_reward(interval_sequence = [interval_A, interval_B, interval_C])
            running_reward += within_octave_reward
        
        
        # ProgtoFinScheme rewards:
        prog_to_fin_scalar = 2.0
        
        if len(self.value_episode) >= 2:
            interval_A = self.value_episode[-2]
            interval_B = self.value_episode[-1]
            prog_to_fin_reward = self.prog_to_fin_scheme.prog_to_fin_reward(interval_0 = interval_A,
                                                                            interval_1 = interval_B,
                                                                            final_interval = self.end_interval_value)
            running_reward += prog_to_fin_scalar * prog_to_fin_reward
        
        
        # No-sticking reward:
        if len(self.value_episode) >= 2:
            interval_A = self.value_episode[-2]
            interval_B = self.value_episode[-1]
            
            if interval_A == interval_B:
                running_reward += -20
        
        return running_reward


Number of pairs: 195


Testing:

In [57]:
agent = Agent_FirstSpecies()
print('\n', agent.value_episode)
print(agent.tensor_episode)

epsilon = 0.2

action_value_spec1 = ActionValue_Spec1()

pair_0 = agent.value_episode[0]

print('\nInitial state:', pair_0)

agent.next_interval()
print('\n', agent.value_episode)
print('\n', len(agent.tensor_episode[-1]))

agent.next_interval()
print('\n', agent.value_episode)
print('\n', len(agent.tensor_episode[-1]))

agent.next_interval()
print('\n', agent.value_episode)
print('\n', len(agent.tensor_episode[-1]))

agent.last_reward()


 [(73, 76)]
[tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])]

Number of pairs: 195

Initial state: (73, 76)

 [(73, 76), (72, 74)]

 195

 [(73, 76), (72, 74), (68, 69)]

 195



-17.566457547512417

___

## Training loop(s).

### Attempt 1.
#### Verdict(s): 
* *Loss function unstable.*
### Improvement hypotheses, ordered by ease-of-investigation:
1. I am back-propping wrong, since I take gradient of *both* expected returns in $\big(R+v_{\mathbf{w}}(s_2,s_1)-v_{\mathbf{w}}(s_1,s_0)\big)^2$. Try taking gradient only with respect to the term $v_{\mathbf{w}}(s_1,s_0)$ under the square. ***FIXED***
2. I am using the same value for `start_interval` and `end_interval` every episode. This doesn't provide a very representative example of the dataset. Try using a different `start_interval` and `end_interval` every episode. ***FIXED***
3. I am updating my weights $\mathbf{w}$ after running a whole episode. This seems to be a form of *off-policy* learning, which is one third of the ["deadly triad"](https://arxiv.org/pdf/1812.02648.pdf). I'm already using another third of the deadly triad: *function approximation*. I don't fully understand what characterizes the last third of the deadly triad — *bootstrapping* — so I can't decide if I'm doing it in this training loop. Anyway, one way to get more "on-policy" for the training loop below would be to run backprop after each step of the episode, instead of the whole episode. ***FIXED***
4. It seems like the agent is taking large steps. Check scales & signs of reward functions. ***FIXED***
5. The agent seems to often have trouble moving in the direction of its final interval. It occurs to me that this is coming from an unintended inductive bias in my model. The action-value function, a.k.a. expected return, only depends on the current state and next state. But the agent should be making different choices when it has a different ending interval, so the model needs to have *ending interval* (and eventually *key*) as that are fixed by choice of agent.
6. Because the *beat count* is not a feature of each state of the model, the action-value function can't make use of it. But the model should probably behave differntly went it's near the beginning of a song versus the end of a song.
7. **Important!** Instead of using the *greedy policy*, used the $\epsilon$-*greedy policy* (see p. 100 of [[*Sutton & Barto*]](http://www.incompleteideas.net/book/the-book-2nd.html)). The underlying hypothesis here is that the agent is getting stuck in short cycles of states because it is not exploring enough. An $\epsilon$-greedy policy is one way to visit states beyond the greedy policy, while staying primarily commited to the greedy policy.

**Remarks.** Some notes about issue that came up as I de-bugged, imporved, and fine-tuned the agent and its action-value model:
* I'm noticing that sometimes during training, if the value of the loss function gets close to zero, it then explodes within a couple rounds of backprop. Bringing the learning rate down solves this. I've also tried some gradient clipping, but can't tell if it's really doing anything. Regardless, my real issue with this is that I can't understand why it's happening.
* *Interval sequence stabilizes way too soon.*
* The role of inductive biases is proving to be really interesting as a work on this agent and its action-value function.

**Ideas for future agents.**
* *Limit octave range.* Try severely restricting the octave range of each voice, for instance to 2 octaves, i.e., to a range of 24 MIDI values. This restricts the original state space $S$, with cardinality $\#S= 128^2 = 16384$, to a much smaller state space $S_{\text{res}}$ with cardinality $\# S_{\text{res}} = 24^2 = 576$. This will lead to much faster convergence and training. This strategy is also musically viable, since most instruments — including singing human voices — have a severely restricted normal octave range.

In [58]:
act_val = ActionValue_Spec1()


Number of pairs: 195


In [None]:
#notesnotesnotes = Notes()

#agent = Agent_FirstSpecies(start_interval = ('C♯5', 'E5'),
#                           end_interval = ('C4', 'G4'),
#                           expected_return = act_val)

episode_count = 10000
present_bias = 1.0
cycles_per_episode = 1
learning_rate = 0.001

epsilon = 0.2

beats_per_episode = cycles_per_episode * agent.accent_pattern.total_beat_count
print('\nBeats per episode:', beats_per_episode)

loss_function = nn.MSELoss().double()

# Optimizers specified in the torch.optim package
optimizer = torch.optim.SGD(agent.expected_return.parameters(), lr = learning_rate, momentum = 0.9)

for episode_number in tqdm(range(episode_count)):
    
    episode_loss = 0.0
    
    argument_that_maximizes = 0
    
    agent.start_interval_value = randinterval()
    
    agent.end_interval_value = randinterval()
    final_index = agent.possible_pair_indices[agent.end_interval_value]
    end_vector = [float(i == final_index) for i in range(agent.number_of_pairs)]
    end_tensor = torch.Tensor(end_vector)
    agent.end_tensor = end_tensor
    
    agent.value_episode = [agent.start_interval_value]

    tensor_episode = []
    pair_index = agent.possible_pair_indices[agent.start_interval_value]
    vector = [float(i == pair_index) for i in range(agent.number_of_pairs)]
    tensor = torch.Tensor(vector)
    tensor_episode.append(copy.deepcopy(tensor))
    agent.tensor_episode = tensor_episode

#    print('\nEpisode number: {} out of {}'.format(episode_number+1, episode_count))
    
    reward_list = []
    note_list = []
    note_list.append((notesnotesnotes.values_to_notes[agent.start_interval_value[0]][0],
                      notesnotesnotes.values_to_notes[agent.start_interval_value[1]][0]))
    
    # Episode-generating loop:
    for beat_counter in range(beats_per_episode):
        
        agent.next_interval()
        reward = copy.deepcopy(agent.last_reward())
        reward_list.append(reward)
        
#        print('State {}:'.format(beat_counter), agent.value_episode[beat_counter])
    
    # # Expected-return-adjusting loop:
    # for beat_number in range(beats_per_episode):

        # Zero your gradients for every batch, i.e., at every beat!
        optimizer.zero_grad()
        
        
        if 1 < beat_counter < beats_per_episode - 1:
            
            beat_number = beat_counter - 2
        
            previous_state = agent.value_episode[beat_number]
            previous_tensor = agent.tensor_episode[beat_number]
        
            current_state = agent.value_episode[beat_number + 1]
            current_tensor = agent.tensor_episode[beat_number + 1]
            
            action_1 = torch.cat((previous_tensor, current_tensor, agent.end_tensor))
            return_at_current_state = agent.expected_return.forward(action_1)
            
            current_reward = reward_list[beat_number]
            tensor_reward = torch.tensor([current_reward]).float()
            
            next_state = agent.value_episode[beat_number + 2]
            next_tensor = agent.tensor_episode[beat_number + 2]
            
            action_2 = torch.cat((current_tensor, next_tensor, agent.end_tensor))
            return_at_next_state = torch.tensor([agent.expected_return.forward(action_2).item()])
            #print(return_at_next_state)
            
            # print(current_state[0])
            # print(current_state[1])
            # print(notesnotesnotes.notes_to_values)
            note_list.append((notesnotesnotes.values_to_notes[current_state[0]][0],
                              notesnotesnotes.values_to_notes[current_state[1]][0]))
            
            # Compute the loss and its gradients
            loss = loss_function(tensor_reward + present_bias * return_at_next_state, return_at_current_state)
#           
            episode_loss += loss.item()
            
            loss.backward()

            # Adjust learning weights
            optimizer.step()
            
        
        elif beat_counter == beats_per_episode - 1:
            
            beat_number = beat_counter - 2
        
            previous_state = agent.value_episode[beat_number]
            previous_tensor = agent.tensor_episode[beat_number]
        
            current_state = agent.value_episode[beat_number + 1]
            current_tensor = agent.tensor_episode[beat_number + 1]
            
            action_1 = torch.cat((previous_tensor, current_tensor, agent.end_tensor))
            return_at_current_state = agent.expected_return.forward(action_1)
            
            current_reward = reward_list[beat_number]
            tensor_reward = torch.tensor([current_reward]).float()
            
            next_state = agent.value_episode[beat_number + 2]
        
            note_list.append((notesnotesnotes.values_to_notes[current_state[0]][0],
                              notesnotesnotes.values_to_notes[current_state[1]][0]))
        
            note_list.append((notesnotesnotes.values_to_notes[next_state[0]][0],
                              notesnotesnotes.values_to_notes[next_state[1]][0]))
            
            # Compute the loss and its gradients
            loss = loss_function(tensor_reward, return_at_current_state)

            episode_loss += loss.item()

            loss.backward()

            # Adjust learning weights
            optimizer.step()
            
#            print('Last step completed')
        
        nn.utils.clip_grad_norm_(parameters = agent.expected_return.parameters(), max_norm=10, norm_type=2.0)
        
     
    if episode_number%100 == 0:
        print('Average loss this episode:', episode_loss/8)
        print('Start interval:', agent.start_interval_value)
        print('End interval:', agent.end_interval_value)
        print('Epsiode as MIDI-value sequence:', agent.value_episode[0:8])
        print('Epsiode as note sequence:', note_list)
        print('Rewards:', reward_list[0:-1])
    #print('\n', reward_list)



Beats per episode: 8


  0%|                                       | 1/10000 [00:01<4:13:53,  1.52s/it]

Average loss this episode: 143.70694065093994
Start interval: (60, 65)
End interval: (70, 79)
Epsiode as MIDI-value sequence: [(60, 65), (63, 73), (67, 70), (59, 69), (59, 69), (59, 69), (59, 69), (59, 69)]
Epsiode as note sequence: [('C4', 'F4'), ('D♯4', 'C♯5'), ('G4', 'A♯4'), ('B3', 'A4'), ('B3', 'A4'), ('B3', 'A4'), ('B3', 'A4'), ('B3', 'A4')]
Rewards: [10.983342435227932, 11.507267942683661, -4.909990898105401, -17.055631923746425, -17.055631923746425, -17.055631923746425, -16.055631923746425]


  1%|▎                                    | 101/10000 [02:47<4:47:14,  1.74s/it]

Average loss this episode: 81.81544780731201
Start interval: (58, 63)
End interval: (75, 81)
Epsiode as MIDI-value sequence: [(58, 63), (60, 69), (59, 69), (68, 76), (69, 74), (59, 69), (68, 76), (59, 69)]
Epsiode as note sequence: [('A♯3', 'D♯4'), ('C4', 'A4'), ('B3', 'A4'), ('G♯4', 'E5'), ('A4', 'D5'), ('B3', 'A4'), ('G♯4', 'E5'), ('B3', 'A4')]
Rewards: [13.525520144465276, -6.922298590413093, 13.562073440342685, -5.734797412856095, -6.999949432420371, 12.562073440342685, -5.025713224559435]


  2%|▋                                    | 175/10000 [04:42<4:22:53,  1.61s/it]