In [12]:
import numpy as np
import tensorflow as tf
import pretty_midi
import random
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Start by generating a MIDI file programmatically
def create_midi_file(filename="bach_cello_suites.mid"):
    """Write a short, monophonic demo melody to a MIDI file.

    The eight hard-coded ``(pitch, duration)`` pairs are laid out back to
    back on a single instrument track, so each note starts exactly where the
    previous one ends.

    Args:
        filename: Path of the MIDI file to write.
    """
    midi = pretty_midi.PrettyMIDI()
    instrument = pretty_midi.Instrument(program=42)

    notes = [  # Demo cello notes as (MIDI pitch, duration in seconds)
        (60, 1.0), (62, 0.5), (64, 1.5), (65, 1.0),
        (67, 0.5), (69, 1.5), (71, 1.0), (72, 0.5),
    ]

    # Track the running end time ourselves.  ``midi.get_end_time()`` cannot be
    # used here: the instrument is only attached to ``midi`` after the loop,
    # so it would report 0.0 for every note and stack them all at t=0.
    current_time = 0.0
    for pitch, duration in notes:
        end_time = current_time + duration
        instrument.notes.append(
            pretty_midi.Note(velocity=100, pitch=pitch,
                             start=current_time, end=end_time)
        )
        current_time = end_time

    midi.instruments.append(instrument)
    midi.write(filename)
    print(f"MIDI file created: {filename}")
    
# Write the demo MIDI file under its default name, "bach_cello_suites.mid".
create_midi_file()

# Extracting the Notes and Durations from MIDI
def midi_to_notes(midi_path):
    """Load a MIDI file and flatten it into ``(pitch, duration)`` tuples.

    Every note of every instrument is included, instrument by instrument, in
    the order pretty_midi exposes them.

    Args:
        midi_path: Path of the MIDI file to read.

    Returns:
        A list of ``(pitch, duration)`` tuples, where ``pitch`` is the note's
        MIDI pitch number (e.g. C4 = 60) and ``duration`` is
        ``note.end - note.start``.
    """
    midi = pretty_midi.PrettyMIDI(midi_path)
    return [
        (note.pitch, note.end - note.start)
        for track in midi.instruments
        for note in track.notes
    ]

midi_file = "bach_cello_suites.mid"
notes_data = midi_to_notes(midi_file)

# Tokenizing the notes and durations:
# build one sorted vocabulary per feature, then map each value to its index.
unique_notes = sorted({pitch for pitch, _ in notes_data})
unique_durations = sorted({length for _, length in notes_data})

# value -> integer id
note_to_int = {value: index for index, value in enumerate(unique_notes)}
duration_to_int = {value: index for index, value in enumerate(unique_durations)}

# integer id -> value (inverse lookups, used to decode model predictions)
int_to_note = dict(enumerate(unique_notes))
int_to_duration = dict(enumerate(unique_durations))

# The whole piece as (note id, duration id) pairs, in original order.
tokenized_data = [
    (note_to_int[pitch], duration_to_int[length])
    for pitch, length in notes_data
]

# Converting to sequence format:
# slide a window of `seq_len` tokens over the piece; the token that follows
# each window is the prediction target for that window.
seq_len = 5  # Sequence Length
input_notes, input_durations = [], []
output_notes, output_durations = [], []

for start in range(len(tokenized_data) - seq_len):
    stop = start + seq_len
    window = tokenized_data[start:stop]
    target_note, target_duration = tokenized_data[stop]

    input_notes.append([token[0] for token in window])
    input_durations.append([token[1] for token in window])
    output_notes.append(target_note)
    output_durations.append(target_duration)

# Replace the Python lists with NumPy arrays for Keras.
input_notes = np.array(input_notes)
input_durations = np.array(input_durations)
output_notes = np.array(output_notes)
output_durations = np.array(output_durations)

# Building the Transformer Model
class _PositionalEmbedding(tf.keras.layers.Layer):
    """Add a learned per-position vector to a ``(batch, seq_len, dim)`` tensor."""

    def __init__(self, seq_len, dim, **kwargs):
        super().__init__(**kwargs)
        self.seq_len = seq_len
        self.dim = dim
        self.position_embedding = tf.keras.layers.Embedding(seq_len, dim)

    def call(self, inputs):
        # (seq_len, dim) table, broadcast over the batch axis when added.
        positions = tf.range(self.seq_len)
        return inputs + self.position_embedding(positions)

    def get_config(self):
        config = super().get_config()
        config.update({"seq_len": self.seq_len, "dim": self.dim})
        return config


def build_transformer(note_vocab_size, duration_vocab_size, seq_len, embed_dim=128, num_heads=4, ff_dim=256):
    """Build and compile a single-block attention model for next-token prediction.

    The model takes two integer sequences of length ``seq_len`` (note ids and
    duration ids), embeds each one, concatenates the embeddings along the
    feature axis, adds a learned positional embedding, and applies one
    self-attention block followed by a two-layer feed-forward projection.
    The sequence is then average-pooled and fed to two softmax heads.

    Args:
        note_vocab_size: Number of distinct note ids.
        duration_vocab_size: Number of distinct duration ids.
        seq_len: Length of the input windows.
        embed_dim: Embedding width of each input; the concatenated feature
            width is ``2 * embed_dim``.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden feed-forward layer.

    Returns:
        A compiled ``tf.keras.Model`` with inputs ``[notes, durations]`` and
        outputs ``[note_output, duration_output]``, both softmax
        distributions, trained with sparse categorical cross-entropy.
    """
    layers = tf.keras.layers

    note_inputs = layers.Input(shape=(seq_len,))
    duration_inputs = layers.Input(shape=(seq_len,))

    note_embedding = layers.Embedding(note_vocab_size, embed_dim)(note_inputs)
    duration_embedding = layers.Embedding(duration_vocab_size, embed_dim)(duration_inputs)

    x = layers.Concatenate()([note_embedding, duration_embedding])

    # Self-attention followed by average pooling is permutation-invariant:
    # without positional information the model cannot tell in which order
    # the notes were played.  Inject a learned position signal first.
    x = _PositionalEmbedding(seq_len, 2 * embed_dim)(x)

    # Self-attention sub-block with residual connection.
    attn_output = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(x, x)
    x = layers.Add()([x, attn_output])
    x = layers.LayerNormalization()(x)

    # Feed-forward projection.  It narrows the features from 2 * embed_dim to
    # embed_dim, so there is no residual connection around it.
    x = layers.Dense(ff_dim, activation="relu")(x)
    x = layers.Dense(embed_dim)(x)
    x = layers.LayerNormalization()(x)

    # Collapse the time axis and predict the next note and duration.
    x = layers.GlobalAveragePooling1D()(x)
    note_output = layers.Dense(note_vocab_size, activation="softmax", name="note_output")(x)
    duration_output = layers.Dense(duration_vocab_size, activation="softmax", name="duration_output")(x)

    model = tf.keras.Model(inputs=[note_inputs, duration_inputs], outputs=[note_output, duration_output])
    model.compile(
        loss=["sparse_categorical_crossentropy", "sparse_categorical_crossentropy"],
        optimizer="adam",
        metrics=["accuracy", "accuracy"],
    )

    return model

# Size the model from the two vocabularies and train it on the windowed data.
model = build_transformer(
    note_vocab_size=len(note_to_int),
    duration_vocab_size=len(duration_to_int),
    seq_len=seq_len,
)
model.fit(
    x=[input_notes, input_durations],
    y=[output_notes, output_durations],
    batch_size=16,
    epochs=10,
)

# Generating the new music
def generate_music(model, start_note, start_duration, length=20):
    """Greedily generate a melody, one (note, duration) token pair at a time.

    Starting from a single seed token pair, the model is repeatedly asked for
    the most likely next note id and duration id given the last ``seq_len``
    tokens (``seq_len`` is the module-level window length).

    Args:
        model: Model taking ``[notes, durations]`` windows and returning
            ``(note_probabilities, duration_probabilities)``.
        start_note: Integer id of the seed note.
        start_duration: Integer id of the seed duration.
        length: Number of tokens to generate after the seed.

    Returns:
        A list of ``length + 1`` ``(note, duration)`` tuples decoded through
        ``int_to_note`` / ``int_to_duration``, seed included.
    """
    note_sequence = [start_note]
    duration_sequence = [start_duration]

    for _ in range(length):
        # Keep only the most recent `seq_len` tokens; shorter histories are
        # left-padded with 0.
        # NOTE(review): 0 is also a valid token id, so the model cannot tell
        # padding apart from token 0 -- consider reserving a pad id.
        note_window = pad_sequences([note_sequence], maxlen=seq_len, padding="pre")
        duration_window = pad_sequences([duration_sequence], maxlen=seq_len, padding="pre")

        # verbose=0: do not print a progress bar for every generated token.
        pred_note, pred_duration = model.predict([note_window, duration_window], verbose=0)

        # Greedy decoding; store plain Python ints rather than np.int64.
        note_sequence.append(int(np.argmax(pred_note)))
        duration_sequence.append(int(np.argmax(pred_duration)))

    return [(int_to_note[n], int_to_duration[d]) for n, d in zip(note_sequence, duration_sequence)]

# Seed the generator with a randomly chosen note id and duration id.
start_note = random.choice([*note_to_int.values()])
start_duration = random.choice([*duration_to_int.values()])

generated_music = generate_music(model, start_note, start_duration)
print("Generated Music:", generated_music)


MIDI file created: bach_cello_suites.mid
Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - duration_output_accuracy: 0.0000e+00 - loss: 4.6195 - note_output_accuracy: 0.3333
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - duration_output_accuracy: 0.6667 - loss: 2.0965 - note_output_accuracy: 0.6667
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - duration_output_accuracy: 1.0000 - loss: 1.2724 - note_output_accuracy: 0.6667
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - duration_output_accuracy: 1.0000 - loss: 0.7851 - note_output_accuracy: 1.0000
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - duration_output_accuracy: 1.0000 - loss: 0.6966 - note_output_accuracy: 0.6667
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - duration_output_accuracy: 1.0000 - loss: 0.4965 - no