<a href="https://colab.research.google.com/github/Tyler-Hattori/fretboard_ai/blob/fourth/fretboard_ai_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### imports

In [2]:
import numpy as np
from numpy import random
from numpy.linalg import inv
from IPython.display import Audio, display
from scipy.io.wavfile import read
import matplotlib.pyplot as plt
from scipy import signal
import torch
from torch import nn
import tensorflow as tf
import math
import matplotlib.pyplot as plt
from tensorflow import keras
from keras.layers import MultiHeadAttention, LayerNormalization, Layer, Dense, Dropout, Flatten, Embedding, Add
import os
from google.colab import files
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### audio feature extraction

In [151]:
def audio2tones(file_name, notes=3, play_audio=False, make_plots=False, Fs=44100,
                drive_path='/content/drive/My Drive/FretboardAI/'):
  """Turn the first second of a WAV file into a short sequence of integer tokens.

  Pipeline: load the analysis matrix ``H<res>.npy`` from ``drive_path``, project
  the audio onto it, square and normalise the magnitudes, max-pool every ``res``
  fine-grained tones into one note bin, mark the ``notes`` strongest bins with a
  1, and pack each run of ``Q`` binary entries into one decimal token.

  Args:
    file_name: WAV file name inside ``<drive_path>/audio/``.
    notes: how many of the strongest note bins to keep.
    play_audio: if True, play the file inline with IPython's ``Audio`` widget.
    make_plots: if True, plot the waveform, spectrogram and tone vectors.
    Fs: sample rate used to size the one-second analysis window.
    drive_path: folder holding ``H<res>.npy`` and the ``audio/`` sub-folder.

  Returns:
    A list of ``num_tones / (Q * res)`` Python ints (6 with the built-in
    settings), each in ``[0, 2**Q)``.

  Notes:
    H is assumed to have shape (num_tones, Fs * dur) -- TODO confirm against the
    code that generated the .npy file.
  """
  # params
  res = 5                      # fine-grained tones per note bin
  num_tones = 48 * res
  dur = 1 # second
  N = int(Fs * dur)
  Q = 8                        # binary entries packed into one token
  scale = ['E','F','F#','G','G#','A','A#','B','C','C#','D','D#']
  # Only the first tone of every note bin gets a label; the rest stay blank.
  tone_labels = [scale[(i // res) % len(scale)] if i % res == 0 else ' ' for i in range(num_tones)]

  # load H matrix from drive
  H = np.load(os.path.join(drive_path, 'H' + str(res) + '.npy'))

  sample_file_name = os.path.join(drive_path, 'audio', file_name)

  if play_audio:
    print('Playing audio file: ' + sample_file_name)
    display(Audio(sample_file_name, autoplay=True))
    print()

  # extract tones
  _, audio = read(sample_file_name)
  audio = np.asarray(audio)
  if audio.ndim > 1:
    # multi-channel file: analyse the first channel only
    audio = audio[:, 0]
  audio = audio[:N]
  if len(audio) < N:
    # zero-pad short clips so the product with H is well defined
    audio = np.pad(audio, (0, N - len(audio)))
  tones = (np.abs(H @ audio) / N) ** 2
  peak = tones.max()
  if peak > 0:
    # skip normalisation for silent input instead of producing NaNs
    tones = tones / peak
  print()

  # group tones into note bins (max over each run of `res` tones)
  num_bins = len(tones) // res
  tones_consolidated = tones[:num_bins * res].reshape(num_bins, res).max(axis=1)
  tone_labels_consolidated = [scale[i % len(scale)] for i in range(num_bins)]

  if make_plots:
    # plot time signal
    plt.figure(figsize=(20,4))
    plt.plot(audio)
    plt.ylabel("Amplitude")
    plt.xlabel("Time")
    plt.title("Audio Sample")
    plt.show()
    print()

    # plot spectrogram
    f, t, Sxx = signal.spectrogram(audio, Fs, nfft=2048)
    plt.figure(figsize=(20,4))
    plt.pcolormesh(t, f, Sxx, shading='gouraud')
    plt.ylabel('Frequency [Hz]')
    plt.xlabel('Time [sec]')
    plt.ylim(0, 1000) # max freq
    plt.title('Spectrogram')
    plt.show()
    print()

    # plot tones vector
    plt.figure(figsize=(20,4))
    plt.stem(tones)
    plt.xlabel('Tone')
    plt.ylabel('Amplitude')
    plt.title('Tones found in audio')
    plt.xticks(range(len(tones)),tone_labels)
    plt.show()
    print()

    # plot simplified tones vector
    plt.figure(figsize=(20,4))
    plt.stem(tones_consolidated)
    plt.xlabel('Tone')
    plt.ylabel('Amplitude')
    plt.title('Tones found in audio (max pooling)')
    plt.xticks(range(len(tones_consolidated)),tone_labels_consolidated)
    plt.show()
    print()

  # print max tones
  max_tones = tones_consolidated.argsort()[::-1][:notes]
  notes_found = [tone_labels_consolidated[i] for i in max_tones]
  print('Notes found are : ' + str(notes_found))
  print()

  # binary note vector: repeatedly take the strongest remaining bin
  tones_binary = np.zeros(num_bins, int)
  remaining = tones_consolidated.copy()
  for _ in range(notes):
    idx = np.argmax(remaining)
    tones_binary[idx] = 1
    remaining[idx] = 0

  # pack every Q binary entries into one decimal token (entry 0 is the least significant bit)
  bit_weights = [pow(2,i) for i in range(Q)]
  num_tokens = num_tones // (Q * res)
  tokens = [int(np.inner(bit_weights, tones_binary[k*Q:k*Q+Q])) for k in range(num_tokens)]

  return tokens

### transformer models

#### tones2tab

In [94]:
# tones2tab hyperparameters
T, Q = 48, 8             # T/Q should be an integer
N = T // Q               # tokens per input sequence
D = 96                   # default model width (dim_model)
dff = 4 * D              # default hidden width of the feed-forward blocks
H = 8                    # default number of attention heads
L = 6                    # default number of encoder layers
dropout_rate = 0.1
batch_size = 32
t2t_vocab_length = 2 ** Q
t2t_output_dim = 25 # one token for each of 24 possible frets and another to denote the string is muted
A = 440 # Hz
# Fret frequencies for six strings, 24 entries each. Every string starts a fixed
# number of semitones away from A (presumably standard tuning, low string first).
guitar = [[A*pow(2,(open_offset+i)/12) for i in range(24)]
          for open_offset in (-29, -24, -19, -14, -10, -5)]

def t2t_positional_encoding(length, depth):
  """Build a (length, depth) float32 table of sinusoidal position encodings.

  The first half of the last axis holds sines, the second half cosines.
  """
  half_depth = depth/2

  pos = np.arange(length)[:, np.newaxis]                         # (length, 1)
  depth_frac = np.arange(half_depth)[np.newaxis, :]/half_depth   # (1, depth/2)

  # One angular rate per depth index, shrinking geometrically from 1 towards 1/10000.
  rates = 1 / (10000**depth_frac)
  angles = pos * rates                                           # (length, depth/2)

  table = np.concatenate([np.sin(angles), np.cos(angles)], axis=-1)
  return tf.cast(table, dtype=tf.float32)

class T2TPositionalEmbedding(Layer):
  """Token embedding with a fixed sinusoidal position signal added on top."""

  def __init__(self, vocab_size=t2t_vocab_length, dim_model=D):
    super().__init__()
    self.d_model = dim_model
    self.embedding = Embedding(
        input_dim=vocab_size,
        output_dim=dim_model,
        mask_zero=True)
    # Table for up to 2048 positions; call() slices it to the input length.
    self.pos_encoding = t2t_positional_encoding(length=2048, depth=dim_model)

  def call(self, x):
    seq_len = tf.shape(x)[1]
    # sqrt(d_model) sets the relative scale of the embedding and the positional encoding.
    scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    embedded = self.embedding(x) * scale
    return embedded + self.pos_encoding[tf.newaxis, :seq_len, :]

class T2TBaseAttention(Layer):
  """Base class bundling the three sub-layers an attention block needs.

  Subclasses supply ``call``; this class only creates the multi-head attention,
  the layer normalisation and the residual ``Add``.
  """
  def __init__(self, **kwargs):
    # All keyword arguments are forwarded unchanged to MultiHeadAttention.
    super().__init__()
    self.mha = MultiHeadAttention(**kwargs)
    self.layernorm = LayerNormalization()
    self.add = Add()

class T2TGlobalSelfAttention(T2TBaseAttention):
  """Self-attention over the whole sequence with a residual connection and layer norm."""

  def call(self, x):
    # Query, key and value are all the input itself.
    attended = self.mha(query=x, key=x, value=x)
    return self.layernorm(self.add([x, attended]))

class T2TFeedForward(Layer):
  """Two-layer MLP (ReLU then linear) with dropout, a residual connection and layer norm."""

  def __init__(self, dim_model=D, dim_mlp=dff, dropout_rate=dropout_rate):
    super().__init__()
    expand = Dense(dim_mlp, activation='relu')
    project = Dense(dim_model)
    drop = Dropout(dropout_rate)
    self.seq = tf.keras.Sequential([expand, project, drop])
    self.add = Add()
    self.layer_norm = LayerNormalization()

  def call(self, x):
    residual = self.add([x, self.seq(x)])
    return self.layer_norm(residual)

class T2TEncoder(Layer):
  """One encoder layer: global self-attention followed by the feed-forward block.

  Note that ``dropout_rate`` is applied to the attention layer only; the
  feed-forward block is built with its own default dropout rate.
  """

  def __init__(self,*, dim_model=D, num_heads=H, dim_mlp=dff, dropout_rate=dropout_rate):
    super().__init__()
    attention_kwargs = dict(num_heads=num_heads, key_dim=dim_model, dropout=dropout_rate)
    self.self_attention = T2TGlobalSelfAttention(**attention_kwargs)
    self.mlp = T2TFeedForward(dim_model=dim_model, dim_mlp=dim_mlp)

  def call(self, x):
    return self.mlp(self.self_attention(x))

class Tones2Tab(tf.keras.Model):
    """Encoder-only transformer mapping a tone-token sequence to six fret distributions.

    The input is embedded, passed through ``L`` encoder layers, flattened, and fed
    to six independent MLP heads (``mlp_head1`` .. ``mlp_head6``). Each head emits
    a softmax over ``output_dim`` classes.

    Args:
      output_dim: number of classes per head.
      dim_model: embedding / model width, forwarded to the embedding and encoders.
      seq_length: kept for interface compatibility; not used by the layers.
      dim_mlp: hidden width of the encoder feed-forward blocks and of the heads.
      L: number of encoder layers.
      dropout_rate: dropout after the embedding and inside the encoders.
    """
    def __init__(self, *, output_dim=t2t_output_dim, dim_model=D, seq_length=N, dim_mlp=dff, L=L, dropout_rate=dropout_rate):
        super().__init__()
        self.dim_model = dim_model
        self.L = L

        # Forward the constructor arguments so non-default sizes take effect;
        # previously the sub-layers always used their own defaults.
        self.embed = T2TPositionalEmbedding(dim_model=dim_model)
        self.encoder_layers = [
            T2TEncoder(dim_model=dim_model, dim_mlp=dim_mlp, dropout_rate=dropout_rate)
            for _ in range(L)
        ]
        self.dropout = Dropout(dropout_rate)

        def make_head():
            return tf.keras.Sequential([
              Dense(dim_mlp, activation='relu'),
              Dense(output_dim)
            ])

        # Attribute names are kept as-is so existing checkpoints still match.
        self.mlp_head1 = make_head()
        self.mlp_head2 = make_head()
        self.mlp_head3 = make_head()
        self.mlp_head4 = make_head()
        self.mlp_head5 = make_head()
        self.mlp_head6 = make_head()

    def call(self, x): # x is (None, N) where None refers to batch size
        x = self.embed(x) # (None, N, dim_model)
        x = self.dropout(x)
        for encoder in self.encoder_layers: x = encoder(x)

        x = Flatten()(x) # (None, N*dim_model)
        heads = (self.mlp_head1, self.mlp_head2, self.mlp_head3,
                 self.mlp_head4, self.mlp_head5, self.mlp_head6)
        per_head = [tf.nn.softmax(head(x)) for head in heads] # each (None, output_dim)

        # Stack along a new axis 1 -> (None, 6, output_dim). This is the same layout
        # the per-sample Python loop produced, but it does not need a concrete batch size.
        return tf.stack(per_head, axis=1)

# Instantiate tones2tab with the default hyperparameters bound when the class was
# defined, then restore its weights from the mounted Drive folder.
t2t = Tones2Tab()
t2t.load_weights('/content/drive/My Drive/FretboardAI/tones2tab')

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7be1297cb640>

In [95]:
def tones2tab(tones,model=t2t):
  """Predict one class index per model output row for a single token sequence.

  Args:
    tones: sequence of integer tokens (as produced by ``audio2tones``).
    model: callable returning a (batch, rows, classes) tensor of probabilities.

  Returns:
    A list of Python ints, one argmax per output row of the first batch element.
  """
  batch = tf.convert_to_tensor([tones], dtype=tf.int32)
  probs = model(batch)[0]
  # Iterate over the model's output rows rather than over the input tokens; the
  # two counts only coincide for the current T/Q setting.
  return [int(tf.math.argmax(probs[row])) for row in range(len(probs))]

In [143]:
def print_tab(tab):
  """Print a tablature column and return it as a list of strings.

  Token 0 is shown as '-'; any other token k is shown as the number k-1.
  """
  out = []
  for token in tab:
    out.append('-' if token == 0 else str(token-1))
  print('Guitar tablature: ')
  for symbol in out:
    print(symbol)
  print()
  return out

#### tab2chord

In [96]:
# tab2chord hyperparameters.
# NOTE: N, D, H, L, dff, batch_size and dropout_rate rebind the names already set
# in the tones2tab cell. The tones2tab classes captured the earlier values as
# default arguments when they were defined, so this cell must run after that one.
N = 6 # guitar tab input
D = 512 # default model width for the T2C layers
H = 8 # default number of attention heads
L = 6 # default number of encoder layers
dff = D*4 # default hidden width of the feed-forward blocks and heads
batch_size = 32
t2c_vocab_length = 25 # same size as t2t_output_dim, the tokens tones2tab emits
t2c_output_dim = 15 # csf output: width of each of the three softmax heads
dropout_rate = 0.1

def t2c_positional_encoding(length, depth):
  """Build a (length, depth) float32 table of sinusoidal position encodings.

  The first half of the last axis holds sines, the second half cosines.
  """
  half_depth = depth/2

  pos = np.arange(length)[:, np.newaxis]                         # (length, 1)
  depth_frac = np.arange(half_depth)[np.newaxis, :]/half_depth   # (1, depth/2)

  # One angular rate per depth index, shrinking geometrically from 1 towards 1/10000.
  rates = 1 / (10000**depth_frac)
  angles = pos * rates                                           # (length, depth/2)

  table = np.concatenate([np.sin(angles), np.cos(angles)], axis=-1)
  return tf.cast(table, dtype=tf.float32)

class T2CPositionalEmbedding(Layer):
  """Token embedding with a fixed sinusoidal position signal added on top."""

  def __init__(self, vocab_size=t2c_vocab_length, dim_model=D):
    super().__init__()
    self.d_model = dim_model
    self.embedding = Embedding(
        input_dim=vocab_size,
        output_dim=dim_model,
        mask_zero=True)
    # Table for up to 2048 positions; call() slices it to the input length.
    self.pos_encoding = t2c_positional_encoding(length=2048, depth=dim_model)

  def call(self, x):
    seq_len = tf.shape(x)[1]
    # sqrt(d_model) sets the relative scale of the embedding and the positional encoding.
    scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    embedded = self.embedding(x) * scale
    return embedded + self.pos_encoding[tf.newaxis, :seq_len, :]

class T2CBaseAttention(Layer):
  """Base class bundling the three sub-layers an attention block needs.

  Subclasses supply ``call``; this class only creates the multi-head attention,
  the layer normalisation and the residual ``Add``.
  """
  def __init__(self, **kwargs):
    # All keyword arguments are forwarded unchanged to MultiHeadAttention.
    super().__init__()
    self.mha = MultiHeadAttention(**kwargs)
    self.layernorm = LayerNormalization()
    self.add = Add()

class T2CGlobalSelfAttention(T2CBaseAttention):
  """Self-attention over the whole sequence with a residual connection and layer norm."""

  def call(self, x):
    # Query, key and value are all the input itself.
    attended = self.mha(query=x, key=x, value=x)
    return self.layernorm(self.add([x, attended]))

class T2CFeedForward(Layer):
  """Two-layer MLP (ReLU then linear) with dropout, a residual connection and layer norm."""

  def __init__(self, dim_model=D, dim_mlp=dff, dropout_rate=dropout_rate):
    super().__init__()
    expand = Dense(dim_mlp, activation='relu')
    project = Dense(dim_model)
    drop = Dropout(dropout_rate)
    self.seq = tf.keras.Sequential([expand, project, drop])
    self.add = Add()
    self.layer_norm = LayerNormalization()

  def call(self, x):
    residual = self.add([x, self.seq(x)])
    return self.layer_norm(residual)

class T2CEncoder(Layer):
  """One encoder layer: global self-attention followed by the feed-forward block.

  Note that ``dropout_rate`` is applied to the attention layer only; the
  feed-forward block is built with its own default dropout rate.
  """

  def __init__(self,*, dim_model=D, num_heads=H, dim_mlp=dff, dropout_rate=dropout_rate):
    super().__init__()
    attention_kwargs = dict(num_heads=num_heads, key_dim=dim_model, dropout=dropout_rate)
    self.self_attention = T2CGlobalSelfAttention(**attention_kwargs)
    self.mlp = T2CFeedForward(dim_model=dim_model, dim_mlp=dim_mlp)

  def call(self, x):
    return self.mlp(self.self_attention(x))

class Tab2Chord(tf.keras.Model):
    """Encoder-only transformer mapping a six-token tab to three class distributions.

    The input is embedded, passed through ``L`` encoder layers, flattened, and fed
    to three independent MLP heads. Downstream code (``print_chord``,
    ``chord2name``) reads the three outputs as color, shape and bar, in that order.

    Args:
      output_dim: number of classes per head.
      dim_model: embedding / model width, forwarded to the embedding and encoders.
      seq_length: kept for interface compatibility; not used by the layers.
      dim_mlp: hidden width of the encoder feed-forward blocks and of the heads.
      L: number of encoder layers.
      dropout_rate: dropout after the embedding and inside the encoders.
    """
    def __init__(self, *, output_dim=t2c_output_dim, dim_model=D, seq_length=N, dim_mlp=dff, L=L, dropout_rate=dropout_rate):
        super().__init__()
        self.dim_model = dim_model
        self.L = L

        # Forward the constructor arguments so non-default sizes take effect;
        # previously the sub-layers always used their own defaults.
        self.embed = T2CPositionalEmbedding(dim_model=dim_model)
        self.encoder_layers = [
            T2CEncoder(dim_model=dim_model, dim_mlp=dim_mlp, dropout_rate=dropout_rate)
            for _ in range(L)
        ]
        self.dropout = Dropout(dropout_rate)

        def make_head():
            return tf.keras.Sequential([
              Dense(dim_mlp, activation='relu'),
              Dense(output_dim)
            ])

        # Attribute names are kept as-is so existing checkpoints still match.
        self.mlp_head1 = make_head()
        self.mlp_head2 = make_head()
        self.mlp_head3 = make_head()

    def call(self, x): # x is (None, N) where None refers to batch size
        x = self.embed(x) # (None, N, dim_model)
        x = self.dropout(x)
        for encoder in self.encoder_layers: x = encoder(x)

        x = Flatten()(x) # (None, N*dim_model)
        c = tf.nn.softmax(self.mlp_head1(x)) # (None, output_dim)
        s = tf.nn.softmax(self.mlp_head2(x)) # (None, output_dim)
        f = tf.nn.softmax(self.mlp_head3(x)) # (None, output_dim)

        # consolidate csf outputs -> (None, 3, output_dim). This is the same layout
        # the per-sample Python loop produced, but it does not need a concrete batch size.
        return tf.stack([c, s, f], axis=1)

# Instantiate tab2chord with the default hyperparameters bound when the class was
# defined, then restore its weights from the mounted Drive folder.
t2c = Tab2Chord()
t2c.load_weights('/content/drive/My Drive/FretboardAI/tab2chord')

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7be129c7ebf0>

In [97]:
def tab2chord(tab,model=t2c):
  """Classify a batch of tabs; returns one [head0, head1, head2] argmax triple per tab.

  ``tab`` is a list of token sequences (a batch), not a single sequence.
  """
  probs = model(tf.convert_to_tensor(tab, dtype=tf.int32))
  return [
      [int(tf.math.argmax(probs[row][head])) for head in range(3)]
      for row in range(len(tab))
  ]

In [98]:
def chord2token(chord):
  """Pack a (color, shape, bar) triple into one 1-based integer token.

  Layout: color is weighted by 5*15, shape by 15, bar by 1, plus an offset of 1
  so that token 0 is never produced.
  """
  color, shape, bar = chord[0], chord[1], chord[2]
  return (color*5 + shape)*15 + bar + 1

In [99]:
def chord2name(chord):
  """Return the chord name for a (color, shape, bar) triple.

  The root note is the shape's root at bar 0 shifted up by ``bar`` semitones,
  looked up in a 12-note chromatic scale with wrap-around. This replaces the
  hand-written per-string tables, one of which skipped 'G#' near its end.

  Args:
    chord: indexable as (color, shape, bar). color 1 appends 'm'. shape selects
      the root: 0 -> C, 1 -> A, 2 -> G, 3 -> E, 4 -> D at bar 0.

  Returns:
    The root note followed by 'm' when color is 1. An unrecognised shape
    contributes no root, so the result is '' or 'm'.
  """
  chromatic = ['E','F','F#','G','G#','A','A#','B','C','C#','D','D#']
  # Semitones above E of each shape's root note at bar 0.
  root_offsets = {0: 8, 1: 5, 2: 3, 3: 0, 4: 10}
  color, shape, bar = chord[0], chord[1], chord[2]
  name = ''
  if shape in root_offsets:
    name = name + chromatic[(root_offsets[shape] + bar) % len(chromatic)]
  if color == 1: name = name + 'm'
  return name

In [135]:
def print_chord(chord):
  """Print a plain-English description of the first chord in ``chord``.

  ``chord`` is a list of (color, shape, bar) triples; only element 0 is described.
  """
  color, shape, bar = chord[0][0], chord[0][1], chord[0][2]

  color_lines = ((0, 'Sounds like a major chord'),
                 (1, 'Sounds like a minor chord'))
  shape_lines = ((0, 'using the C chord shape'),
                 (1, 'using the A chord shape'),
                 (2, 'using the G chord shape'),
                 (3, 'using the E chord shape'),
                 (4, 'using the D chord shape'))

  # Print the first matching line of each table, if any.
  for value, table in ((color, color_lines), (shape, shape_lines)):
    for code, text in table:
      if value == code:
        print(text)
        break
  if not bar == 0:
    print('with a capo on fret ' + str(bar))
  print()

#### seq2key

In [122]:
# seq2key hyperparameters.
# NOTE: N, D, H, L, dff, batch_size and dropout_rate rebind the names already set
# in the earlier model cells. Classes defined there captured the earlier values
# as default arguments, so this cell must run after them.
N = 4 # chord tokens per input sequence (seq2key pads or trims to 4)
D = 512 # default model width for the S2K layers
H = 8 # default number of attention heads
L = 6 # default number of encoder layers
dff = D*4 # default hidden width of the feed-forward blocks and head
batch_size = 32
# NOTE(review): chord2token can return 150 (color=1, shape=4, bar=14), which is
# one past the last valid index of an Embedding with input_dim=150 -- confirm.
s2k_vocab_length = 150 # possible fretboard shapes the code learns from
# NOTE(review): seq2key lists 24 key names but only indices 0-11 can be selected
# with 12 outputs -- confirm the minor keys are intentionally unreachable.
s2k_output_dim = 12 # possible key labels
dropout_rate = 0.1

def s2k_positional_encoding(length, depth):
  """Build a (length, depth) float32 table of sinusoidal position encodings.

  The first half of the last axis holds sines, the second half cosines.
  """
  half_depth = depth/2

  pos = np.arange(length)[:, np.newaxis]                         # (length, 1)
  depth_frac = np.arange(half_depth)[np.newaxis, :]/half_depth   # (1, depth/2)

  # One angular rate per depth index, shrinking geometrically from 1 towards 1/10000.
  rates = 1 / (10000**depth_frac)
  angles = pos * rates                                           # (length, depth/2)

  table = np.concatenate([np.sin(angles), np.cos(angles)], axis=-1)
  return tf.cast(table, dtype=tf.float32)

class S2KPositionalEmbedding(Layer):
  """Token embedding with a fixed sinusoidal position signal added on top."""

  def __init__(self, vocab_size=s2k_vocab_length, dim_model=D):
    super().__init__()
    self.d_model = dim_model
    self.embedding = Embedding(
        input_dim=vocab_size,
        output_dim=dim_model,
        mask_zero=True)
    # Table for up to 2048 positions; call() slices it to the input length.
    self.pos_encoding = s2k_positional_encoding(length=2048, depth=dim_model)

  def call(self, x):
    seq_len = tf.shape(x)[1]
    # sqrt(d_model) sets the relative scale of the embedding and the positional encoding.
    scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    embedded = self.embedding(x) * scale
    return embedded + self.pos_encoding[tf.newaxis, :seq_len, :]

class S2KBaseAttention(Layer):
  """Base class bundling the three sub-layers an attention block needs.

  Subclasses supply ``call``; this class only creates the multi-head attention,
  the layer normalisation and the residual ``Add``.
  """
  def __init__(self, **kwargs):
    # All keyword arguments are forwarded unchanged to MultiHeadAttention.
    super().__init__()
    self.mha = MultiHeadAttention(**kwargs)
    self.layernorm = LayerNormalization()
    self.add = Add()

class S2KGlobalSelfAttention(S2KBaseAttention):
  """Self-attention over the whole sequence with a residual connection and layer norm."""

  def call(self, x):
    # Query, key and value are all the input itself.
    attended = self.mha(query=x, key=x, value=x)
    return self.layernorm(self.add([x, attended]))

class S2KFeedForward(Layer):
  """Two-layer MLP (ReLU then linear) with dropout, a residual connection and layer norm."""

  def __init__(self, dim_model=D, dim_mlp=dff, dropout_rate=dropout_rate):
    super().__init__()
    expand = Dense(dim_mlp, activation='relu')
    project = Dense(dim_model)
    drop = Dropout(dropout_rate)
    self.seq = tf.keras.Sequential([expand, project, drop])
    self.add = Add()
    self.layer_norm = LayerNormalization()

  def call(self, x):
    residual = self.add([x, self.seq(x)])
    return self.layer_norm(residual)

class S2KEncoder(Layer):
  """One encoder layer: global self-attention followed by the feed-forward block.

  Note that ``dropout_rate`` is applied to the attention layer only; the
  feed-forward block is built with its own default dropout rate.
  """

  def __init__(self,*, dim_model=D, num_heads=H, dim_mlp=dff, dropout_rate=dropout_rate):
    super().__init__()
    attention_kwargs = dict(num_heads=num_heads, key_dim=dim_model, dropout=dropout_rate)
    self.self_attention = S2KGlobalSelfAttention(**attention_kwargs)
    self.mlp = S2KFeedForward(dim_model=dim_model, dim_mlp=dim_mlp)

  def call(self, x):
    return self.mlp(self.self_attention(x))

class Seq2Key(tf.keras.Model):
    """Encoder-only transformer classifying a chord-token sequence into a key.

    The input is embedded, passed through ``L`` encoder layers, flattened, and fed
    to a single MLP head whose softmax has ``output_dim`` classes.

    Args:
      output_dim: number of key classes.
      dim_model: embedding / model width, forwarded to the embedding and encoders.
      seq_length: kept for interface compatibility; not used by the layers.
      dim_mlp: hidden width of the encoder feed-forward blocks and of the head.
      L: number of encoder layers.
      dropout_rate: dropout after the embedding and inside the encoders.
    """
    def __init__(self, *, output_dim=s2k_output_dim, dim_model=D, seq_length=N, dim_mlp=dff, L=L, dropout_rate=dropout_rate):
        super().__init__()
        self.dim_model = dim_model
        self.L = L

        # Forward the constructor arguments so non-default sizes take effect;
        # previously the sub-layers always used their own defaults.
        self.embed = S2KPositionalEmbedding(dim_model=dim_model)
        self.encoder_layers = [
            S2KEncoder(dim_model=dim_model, dim_mlp=dim_mlp, dropout_rate=dropout_rate)
            for _ in range(L)
        ]
        self.dropout = Dropout(dropout_rate)

        self.mlp_head = tf.keras.Sequential([
          Dense(dim_mlp, activation='relu'),
          Dense(output_dim)
        ])

    def call(self, x): # x is (None, N) where None refers to batch size
        x = self.embed(x) # (None, N, dim_model)
        x = self.dropout(x)
        for encoder in self.encoder_layers: x = encoder(x)

        x = Flatten()(x) # (None, N*dim_model)
        logits = self.mlp_head(x) # (None, output_dim)

        return tf.nn.softmax(logits)

# Instantiate seq2key with the default hyperparameters bound when the class was
# defined, then restore its weights from the mounted Drive folder.
s2k = Seq2Key()
s2k.load_weights('/content/drive/My Drive/FretboardAI/seq2key')

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7be1134e6020>

In [126]:
def seq2key(seq,model=s2k):
  """Predict the key name for a sequence of chord tokens.

  The model consumes exactly four tokens: longer sequences keep only their last
  four, shorter ones are extended by cycling through their own tokens. The
  caller's list is left untouched.

  Args:
    seq: non-empty sequence of integer chord tokens.
    model: callable returning a (batch, classes) tensor of probabilities.

  Returns:
    The entry of the key-name table at the argmax of the first batch row.

  Raises:
    ValueError: if ``seq`` is empty.
  """
  window = 4
  tokens = list(seq)  # work on a copy so padding never mutates the argument
  if not tokens:
    raise ValueError('seq2key needs at least one chord token')
  if len(tokens) > window:
    tokens = tokens[-window:]
  else:
    tokens = [tokens[i % len(tokens)] for i in range(window)]
  batch = tf.convert_to_tensor([tokens], dtype=tf.int32)
  soft = model(batch)[0]
  # NOTE(review): the table has 24 names; with a 12-way model output only the
  # first 12 can be returned -- confirm this is intended.
  key_names = ['C','C#','D','D#','E','F','F#','G','G#','A','A#','B','Cm','C#m','Dm','D#m','Em','Fm','F#m','Gm','G#m','Am','A#m','Bm']
  return key_names[np.argmax(soft)]

### main audio conversion function

In [139]:
def convert(input,output='tab',make_plots=False,play_audio=False):
  """Run the full audio -> tab -> chord -> key pipeline on one audio file.

  Args:
    input: audio file name, passed straight to ``audio2tones``.
    output: accepted but not used by this function.
    make_plots: forwarded to ``audio2tones``.
    play_audio: forwarded to ``audio2tones``.

  Returns:
    (neat_tab, chord, prog, key): the printable tab, the list of chord triples
    from ``tab2chord``, the list of chord names, and the predicted key name.
  """
  # audio -> tone tokens -> fret tokens
  tones_seq = audio2tones(input, play_audio=play_audio, make_plots=make_plots)
  tab = tones2tab(tones_seq)
  neat_tab = print_tab(tab)

  # fret tokens -> chord triples (tab2chord takes a batch, hence the wrapping list)
  chord = tab2chord([tab])
  print_chord(chord)

  # name each chord and encode it as a token for the key classifier
  seq, prog = [], []
  for number, pattern in enumerate(chord, start=1):
    chord_name = chord2name(pattern)
    print('Chord '+str(number)+': ' + chord_name)
    prog.append(chord_name)
    seq.append(chord2token(pattern))
  print()

  # chord tokens -> key
  key = seq2key(seq)
  print('Sounds like the key of ' + key)

  return neat_tab, chord, prog, key

### testing

In [152]:
# convert() returns (neat_tab, chord, prog, key); they are bound here as
# tab, pattern (chord triples), chord (chord names) and key.
tab, pattern, chord, key = convert('-354--.wav',play_audio=False,make_plots=False)

  input_data = read(sample_file_name)



Notes found are : ['B', 'B', 'F#']

Guitar tablature: 
-
2
4
4
-
-

Sounds like a minor chord
using the A chord shape
with a capo on fret 2

Chord 1: Bm

Sounds like the key of D
