In [None]:
# Step 1: Install the music21 library, used for music analysis and processing
!pip install music21

# Step 2: Upload the ZIP file (archive.zip) that contains MIDI files
from google.colab import files
uploaded = files.upload()  # This will prompt you to upload your dataset

# Step 3: Extract the contents of the uploaded ZIP file into a folder
import zipfile
import os

# Unzipping the uploaded file to a folder named "music_data"
with zipfile.ZipFile("archive.zip", 'r') as zip_ref:
    zip_ref.extractall("music_data")

# Step 4: Recursively walk through all folders inside "music_data"
# and collect full paths of all files ending with .mid (MIDI files)
midi_files = []
for root, dirs, files in os.walk("music_data"):
    for file in files:
        if file.endswith(".mid"):
            full_path = os.path.join(root, file)
            midi_files.append(full_path)

# Print the number of MIDI files found
print(f"Found {len(midi_files)} MIDI files.")

# Step 5: Show a text dump of the first 3 MIDI files as parsed by music21
from music21 import converter

# Only a small preview: the text dump of a single score can be thousands of lines
preview_paths = midi_files[:3]

for position, midi_path in enumerate(preview_paths, start=1):
    print(f"\n🎵 MIDI File {position}: {midi_path}")
    try:
        # Parse into a music21 score and print its elements
        # (notes, rests, durations, etc.) as indented text
        converter.parse(midi_path).show('text')
    except Exception as e:
        # Best effort: report the file that failed and continue with the next one
        print(f"Failed to parse {midi_path}: {e}")




[1;30;43mStreaming output truncated to the last 5000 lines.[0m
        {3.5} <music21.note.Rest eighth>
    {232.0} <music21.stream.Measure 59 offset=232.0>
        {0.0} <music21.note.Rest eighth>
        {0.5} <music21.tempo.MetronomeMark andante Quarter=72.55>
        {0.5} <music21.note.Rest dotted-quarter>
        {2.0} <music21.tempo.MetronomeMark andante Quarter=71.23>
        {2.0} <music21.note.Rest quarter>
        {3.0} <music21.tempo.MetronomeMark andante Quarter=72.55>
        {3.0} <music21.note.Rest quarter>
    {236.0} <music21.stream.Measure 60 offset=236.0>
        {0.0} <music21.note.Rest quarter>
        {1.0} <music21.tempo.MetronomeMark andante Quarter=70.35>
        {1.0} <music21.note.Rest eighth>
        {1.5} <music21.tempo.MetronomeMark andante Quarter=72.55>
        {1.5} <music21.note.Rest 2.5ql>
    {240.0} <music21.stream.Measure 61 offset=240.0>
        {0.0} <music21.note.Rest quarter>
        {1.0} <music21.tempo.MetronomeMark andante Quarter=70.35>


In [None]:
# DataFlair Automatic Music Generation Project

# --- Music and Data Handling ---
from music21 import *        # Used for reading, analyzing, and processing MIDI files
import glob                  # For finding file paths using patterns (like *.mid)
from tqdm import tqdm        # To show progress bars in loops
import numpy as np           # For numerical computations (arrays, reshaping, etc.)
import random                # To introduce randomness (e.g., shuffling data)

# --- Deep Learning (Keras + TensorFlow) ---
from tensorflow.keras.layers import LSTM, Dense, Input, Dropout  # Layers used in building the neural network
from tensorflow.keras.models import Sequential, Model, load_model  # To build and manage models

# --- Train/Test Split ---
from sklearn.model_selection import train_test_split  # To split data into training and validation sets


In [None]:
import numpy as np
import glob
import warnings
from tqdm import tqdm
from music21 import converter, instrument, note, chord

# Suppress warning messages for cleaner output.
# NOTE: called without a category or module filter, this silences every Python
# warning raised from here on, not only the ones coming from music21.
warnings.filterwarnings("ignore")

# Function to read notes and chords from a MIDI file
def read_files(file):
    """Extract the sequence of note/chord tokens from one MIDI file.

    Args:
        file: Path of the MIDI file to parse with music21.

    Returns:
        A list of strings in the order the elements are iterated:
        a single note becomes its pitch name (e.g. "C4"); a chord becomes the
        values of its ``normalOrder`` joined with "." (pitch classes, e.g.
        "0.4.7"). If anything raises while processing, the error is printed
        and the tokens collected up to that point (possibly none) are returned.
    """
    notes = []  # Extracted note/chord tokens
    try:
        # Step 1: Load and parse the MIDI file using music21
        midi = converter.parse(file)

        # Step 2: Try to partition the score by instrument
        by_instrument = instrument.partitionByInstrument(midi)

        notes_to_parse = None

        # Step 3: Prefer the first part whose string form mentions 'Piano'
        if by_instrument:
            for part in by_instrument.parts:
                if 'Piano' in str(part):
                    notes_to_parse = part.recurse()  # Walk nested containers too
                    break
        if notes_to_parse is None:
            # No partition or no piano part: fall back to every note/chord in
            # the flattened score. `.flatten()` replaces the deprecated `.flat`
            # property of music21 streams.
            notes_to_parse = midi.flatten().notes

        # Step 4: Keep only notes and chords; other elements are skipped
        for element in notes_to_parse:
            if isinstance(element, note.Note):  # Single note
                notes.append(str(element.pitch))  # e.g., "C4"
            elif isinstance(element, chord.Chord):  # Multiple notes at once
                notes.append('.'.join(str(n) for n in element.normalOrder))

    except Exception as e:
        # Best effort: report the failing file and return what was collected
        print(f"⚠️ Error processing file {file}: {e}")

    return notes

# Path to your MIDI dataset folder (change if needed)
file_path = "music_data/schubert"

# Get all .mid files in the folder and in any sub-folder.
# glob only honours `recursive=True` when the pattern contains a `**`
# component, so it is spelled out here; `**` also matches zero directories,
# so files directly inside `file_path` are still found.
# The result is sorted because glob returns paths in arbitrary order.
all_files = sorted(glob.glob(f'{file_path}/**/*.mid', recursive=True))

# Step 5: Read all MIDI files and extract note/chord sequences
# (one list of tokens per file; tqdm shows a progress bar while looping)
notes_list = [read_files(i) for i in tqdm(all_files, position=0, leave=True)]


100%|██████████| 29/29 [05:11<00:00, 10.73s/it]


In [None]:
from collections import Counter

# Step 1: Define notes_array with enough repetition for frequency >= 50
# NOTE(review): this cell builds synthetic toy sequences and does not read
# `notes_list` — confirm that is intended.
# Loop variables below are deliberately not called `note`: a top-level `for`
# target stays bound after the loop and would replace the music21 `note`
# module with a plain string for the rest of the session.
notes_array = (
    [['C4', 'E4', 'G4'] for _ in range(60)]    # frequent
    + [['F4', 'A4'] for _ in range(30)]        # less frequent
    + [['D4', 'B4'] for _ in range(10)]        # very rare
)

# Step 2: Flatten the list of all notes
all_notes = [name for seq in notes_array for name in seq]

# Step 3: Count frequencies of each note
freq = Counter(all_notes)

# Step 4: Keep notes with frequency >= 50 (first-seen order is preserved)
freq_notes = {name: count for name, count in freq.items() if count >= 50}

# Step 5: Create new note sequences with only frequent notes
# (sequences made only of rare notes become empty lists)
new_notes = [[name for name in seq if name in freq_notes] for seq in notes_array]

# Step 6: Create index-to-note and note-to-index dictionaries
ind2note = dict(enumerate(freq_notes))  # Index to note
note2ind = {name: idx for idx, name in ind2note.items()}  # Note to index

# Step 7: Show output for teacher verification
print("🎵 Top Frequent Notes (frequency ≥ 50):")
for note_name, count in freq_notes.items():
    print(f"{note_name}: {count}")

print("\n🎶 Sample Converted Note Sequences (first 5):")
for position, seq in enumerate(new_notes[:5], start=1):
    print(f"{position}: {seq}")

print("\n🔢 Note to Index Mapping (sample):")
for note_name, idx in list(note2ind.items())[:5]:
    print(f"{note_name}: {idx}")


🎵 Top Frequent Notes (frequency ≥ 50):
C4: 60
E4: 60
G4: 60

🎶 Sample Converted Note Sequences (first 5):
1: ['C4', 'E4', 'G4']
2: ['C4', 'E4', 'G4']
3: ['C4', 'E4', 'G4']
4: ['C4', 'E4', 'G4']
5: ['C4', 'E4', 'G4']

🔢 Note to Index Mapping (sample):
C4: 0
E4: 1
G4: 2


In [None]:
# index -> note, numbered in the iteration order of freq_notes
ind2note = {idx: name for idx, name in enumerate(freq_notes)}

# note -> index, the inverse of the mapping above
note2ind = {name: idx for idx, name in ind2note.items()}

In [None]:
# Number of consecutive notes fed to the model for each prediction
timesteps = 50

# Encoded input windows (x) and the note index that follows each window (y)
x = []
y = []

for sequence in new_notes:
    # A sequence with at most `timesteps` notes yields no window at all
    window_count = len(sequence) - timesteps
    for start in range(window_count):
        stop = start + timesteps
        # Input: `timesteps` notes starting at `start`, as note indices
        x.append([note2ind[name] for name in sequence[start:stop]])
        # Target: index of the note right after the window
        y.append(note2ind[sequence[stop]])

x_new = np.array(x)
y_new = np.array(y)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Example settings
timesteps = 10  # Define how many steps you look back

# Flatten new_notes into one long sequence and encode every note name as its
# integer index from note2ind: the arrays built below must be numeric to be
# usable as LSTM input / sparse class targets.
note_seq = [note2ind[name] for seq in new_notes for name in seq]

# Create input-output pairs: each window of `timesteps` indices predicts the
# index that immediately follows it.
window_starts = range(len(note_seq) - timesteps)
x_new = np.array([note_seq[start:start + timesteps] for start in window_starts])
y_new = np.array([note_seq[start + timesteps] for start in window_starts])

# Reshape for LSTM input: (samples, timesteps, features)
x_new = np.reshape(x_new, (x_new.shape[0], timesteps, 1))
y_new = np.reshape(y_new, (-1, 1))

# Now split (fixed random_state keeps the split reproducible)
x_train, x_test, y_train, y_test = train_test_split(x_new, y_new, test_size=0.2, random_state=42)

# Print shapes for verification
print("x_new shape:", x_new.shape)
print("y_new shape:", y_new.shape)
print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)


x_new shape: (170, 10, 1)
y_new shape: (170, 1)
x_train shape: (136, 10, 1)
x_test shape: (34, 10, 1)


In [None]:
# Build the network: two stacked LSTM layers (latent dimension 256), each
# followed by dropout, then a dense ReLU layer and a softmax output with one
# unit per entry of note2ind.
window_shape = (x_new.shape[1], x_new.shape[2])  # (timesteps, features)

model = Sequential([
    # First LSTM returns the full sequence so the second LSTM can consume it
    LSTM(256, return_sequences=True, input_shape=window_shape),
    Dropout(0.2),
    LSTM(256),
    Dropout(0.2),
    Dense(256, activation='relu'),
    # Fully connected output layer with softmax activation
    Dense(len(note2ind), activation='softmax'),
])
model.summary()

In [None]:
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical

# Step 1: Sample notes_array with repetition (60 / 30 / 10 copies per pattern)
notes_array = (
    [['C4', 'E4', 'G4'] for _ in range(60)]
    + [['F4', 'A4'] for _ in range(30)]
    + [['D4', 'B4'] for _ in range(10)]
)

# Step 2: Count every note and keep only those seen at least 50 times
all_notes = [name for seq in notes_array for name in seq]
freq = Counter(all_notes)
freq_notes = {name: count for name, count in freq.items() if count >= 50}
new_notes = [[name for name in seq if name in freq_notes] for seq in notes_array]

# Step 3: One flat stream of frequent notes and a sorted vocabulary for it
flattened_notes = [name for seq in new_notes for name in seq]
unique_notes = sorted(set(flattened_notes))
note2ind = {name: idx for idx, name in enumerate(unique_notes)}
ind2note = dict(enumerate(unique_notes))

# Integer-encode the stream
encoded_notes = [note2ind[name] for name in flattened_notes]

# Step 4: Sliding windows of `timesteps` indices, each paired with the next index
timesteps = 10
window_starts = range(len(encoded_notes) - timesteps)
x_new = np.array([encoded_notes[start:start + timesteps] for start in window_starts])
y_new = np.array([encoded_notes[start + timesteps] for start in window_starts])

# Shape as (samples, timesteps, 1) for the LSTM and scale indices by the
# vocabulary size
x_new = np.reshape(x_new, (x_new.shape[0], timesteps, 1))
x_new = x_new / float(len(unique_notes))  # Normalize input

# Step 5: Hold out 20% of the windows; fixed random_state for a repeatable split
x_train, x_test, y_train, y_test = train_test_split(
    x_new, y_new, test_size=0.2, random_state=42
)

# Step 6: Two stacked 128-unit LSTMs with dropout and a softmax over the vocabulary
model = Sequential([
    LSTM(128, input_shape=(timesteps, 1), return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(len(unique_notes), activation='softmax'),
])

# Step 7: Compile and train
# Targets are integer class indices, hence the sparse cross-entropy loss
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

history = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=10,
    batch_size=128,
)

# Report the metrics of the last epoch
print("\n✅ Training complete!")
print("Final Training Accuracy:", history.history['accuracy'][-1])
print("Final Validation Accuracy:", history.history['val_accuracy'][-1])


Epoch 1/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2s/step - accuracy: 0.3100 - loss: 1.0993 - val_accuracy: 0.1765 - val_loss: 1.1100
Epoch 2/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 256ms/step - accuracy: 0.3574 - loss: 1.0955 - val_accuracy: 0.1765 - val_loss: 1.1133
Epoch 3/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 267ms/step - accuracy: 0.3649 - loss: 1.0958 - val_accuracy: 0.1765 - val_loss: 1.1277
Epoch 4/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 195ms/step - accuracy: 0.3675 - loss: 1.0935 - val_accuracy: 0.1765 - val_loss: 1.1470
Epoch 5/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 488ms/step - accuracy: 0.3649 - loss: 1.0923 - val_accuracy: 0.1765 - val_loss: 1.1693
Epoch 6/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 337ms/step - accuracy: 0.3727 - loss: 1.0908 - val_accuracy: 0.1765 - val_loss: 1.1791
Epoch 7/10
[1m2/2[0m [32m━━━━━━━━━━━━━━

In [None]:
# ✅ Persist the trained model in the .keras format
model_file = "s2s_model.keras"
model.save(model_file)
print("✅ Model saved as s2s_model.keras")

# ✅ Read it back from disk
from tensorflow.keras.models import load_model
loaded_model = load_model(model_file)
print("✅ Model loaded successfully")

# ✅ Sanity check: predict the note that follows the first test window
import numpy as np

window_length = x_test.shape[1]
sample_input = np.reshape(x_test[0], (1, window_length, 1))  # batch of one
prediction = loaded_model.predict(sample_input)
predicted_index = np.argmax(prediction)      # most probable class
predicted_note = ind2note[predicted_index]   # back to a note name

print("\n🎵 Predicted next note index:", predicted_index)
print("🎶 Predicted next note name:", predicted_note)


✅ Model saved as s2s_model.keras
✅ Model loaded successfully
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 383ms/step

🎵 Predicted next note index: 2
🎶 Predicted next note name: G4


In [None]:
import numpy as np
from tensorflow.keras.models import load_model

# ✅ Load the model (use the actual file name you saved)
model = load_model("s2s_model.keras")

# ✅ Pick a random seed window from x_test.
# np.random.randint excludes its upper bound, so len(x_test) is passed as-is:
# every test window (including the last one) can be chosen, and a test set
# with a single window does not raise.
index = np.random.randint(0, len(x_test))

# ✅ Get the pattern from x_test
music_pattern = x_test[index]
out_pred = []  # store predicted notes

# ✅ Predict 200 notes, feeding each prediction back into the input window
for i in range(200):
    # Model expects (batch, timesteps, features)
    music_pattern = music_pattern.reshape(1, len(music_pattern), 1)

    pred_probs = model.predict(music_pattern, verbose=0)
    pred_index = np.argmax(pred_probs)  # greedy choice: most probable note

    # Append predicted note to output
    out_pred.append(ind2note[pred_index])

    # Append the new prediction to the window, scaled by the vocabulary size
    # like the other values in the window (np.append flattens the array)
    music_pattern = np.append(music_pattern, pred_index / float(len(ind2note)))  # normalize

    # Keep sequence length constant by trimming the first note
    music_pattern = music_pattern[1:]

# ✅ Show the first few predicted notes
print("🎶 Generated Notes (first 20):")
print(out_pred[:20])


🎶 Generated Notes (first 20):
['C4', 'G4', 'G4', 'C4', 'C4', 'G4', 'C4', 'G4', 'C4', 'C4', 'G4', 'G4', 'C4', 'C4', 'C4', 'G4', 'G4', 'C4', 'C4', 'G4']


In [None]:
from music21 import stream, note, chord, instrument

# ✅ Turn every predicted token into a music21 Note or Chord, one per offset step
def _piano_note(pitch):
    """Build a Note for `pitch` (name or integer) tagged with a Piano instrument."""
    piano_note = note.Note(pitch)
    piano_note.storedInstrument = instrument.Piano()
    return piano_note


output_notes = []

for offset, pattern in enumerate(out_pred):
    # Chord tokens are dot-separated integers (a lone integer also counts);
    # anything else is treated as a single pitch name such as "C4"
    is_chord = pattern.isdigit() or ('.' in pattern)
    if is_chord:
        members = [_piano_note(int(token)) for token in pattern.split('.')]
        element = chord.Chord(members)
    else:
        element = _piano_note(pattern)

    # The token's position in the prediction list is used as its offset
    element.offset = offset
    output_notes.append(element)

# ✅ Wrap the elements in a stream and export it as a MIDI file
midi_stream = stream.Stream(output_notes)
midi_stream.write('midi', fp='pred_music.mid')

print("✅ MIDI file saved as 'pred_music.mid'")


✅ MIDI file saved as 'pred_music.mid'


In [None]:
# Hand the generated 'pred_music.mid' file to google.colab's download helper
# (only available when running inside Google Colab).
from google.colab import files
files.download('pred_music.mid')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>