In [None]:
pip install mido pydub



In [None]:
import torch
import torch.cuda
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import random
import os
from os import walk
from IPython.display import Audio

In [None]:
# MIDI To Tensor
import mido

def extract_notes(midi_file_path):
    """Extract the notes of a MIDI file as ``[start, duration, pitch, velocity]`` rows.

    Start times and durations are sums of ``msg.time`` over the messages of a
    track, so they are in the unit mido reports for track messages (MIDI
    ticks according to the mido documentation, not seconds).

    Args:
        midi_file_path: Path of the MIDI file, handed to ``mido.MidiFile``.

    Returns:
        A list of ``[start_time, duration, note, velocity]`` lists. Notes are
        appended when they end; tracks are processed one after another in file
        order. A note that is never released is not emitted.
    """
    midi = mido.MidiFile(midi_file_path)
    notes = []

    for track in midi.tracks:
        # Delta times are relative to the previous message of the *same* track,
        # so both the clock and the set of sounding notes restart per track.
        current_time = 0
        active_notes = {}

        for msg in track:
            current_time += msg.time
            if msg.type not in ('note_on', 'note_off'):
                continue  # meta / control messages only advance the clock

            # The same pitch may sound on several channels at once.
            key = (msg.channel, msg.note)
            if msg.type == 'note_on' and msg.velocity > 0:
                active_notes[key] = (current_time, msg.velocity)
            elif key in active_notes:
                # 'note_off', or 'note_on' with velocity 0, ends the note.
                start_time, velocity = active_notes.pop(key)
                notes.append([start_time, current_time - start_time, msg.note, velocity])

    return notes


In [None]:
# Audio To Tensor
import librosa
import pydub

def audio_to_tensor(audio_path, sr=22050):
  """Decode an audio file into a waveform array.

  Args:
    audio_path: Path of the audio file, handed to ``librosa.load``.
    sr: Sample rate requested from ``librosa.load``. Defaults to 22050, the
      rate this project standardizes on to keep memory usage manageable.

  Returns:
    The waveform returned by ``librosa.load`` (the first element of its
    result); the sample rate it also returns is discarded.
  """
  audio_tensor, _ = librosa.load(audio_path, sr=sr)
  return audio_tensor

In [None]:
import torch
# Auxiliary cell: pair every MIDI file with its MP3 rendering and load both
midi_temp = []
midi_fin = []
midi_path = "/content/drive/MyDrive/MIDI4STRINGS_but_better/Training Data/WebScrape/MIDI/"
mp3_temp = []
mp3_fin = []
mp3_path = "/content/drive/MyDrive/MIDI4STRINGS_but_better/Training Data/WebScrape/MP3/"

# The MIDI folder drives the pairing because the MP3 folder currently holds extra files
dir_path = midi_path

# List only the top level of the MIDI folder: the paths below are built as
# <folder>/<file name>, so files inside sub-folders could not be opened anyway.
_root, _sub_dirs, file_names = next(walk(dir_path), (dir_path, [], []))
midi_temp.extend(name for name in file_names if name.lower().endswith((".mid", ".midi")))
print(midi_temp) # Check

# Swap the extension for .mp3 (splitext also copes with ".midi" and dotted names)
for file in midi_temp:
  mp3_temp.append(os.path.splitext(file)[0] + ".mp3")
print(mp3_temp) # Check

# Turning MIDI files into note lists
for file in midi_temp:
  midi_file_path = os.path.join(midi_path, file)
  tensor_data = extract_notes(midi_file_path)
  midi_fin.append(tensor_data)
# Summarize instead of dumping every note of every song
print(f"Loaded {len(midi_fin)} MIDI files; notes per file: {[len(song) for song in midi_fin]}") # Check

# Turning MP3 files into waveforms
for file in mp3_temp:
  mp3_file_path = os.path.join(mp3_path, file)
  tensor_data = audio_to_tensor(mp3_file_path)
  mp3_fin.append(tensor_data)
# Summarize instead of printing millions of samples
print(f"Loaded {len(mp3_fin)} MP3 files; samples per file: {[len(audio) for audio in mp3_fin]}") # Check

['Canon_in_D__Violin_Solo_.mid', 'Bach_Cello_Suite_No._1_For_Violin.mid', 'Summer_-_Third_movement.mid', 'Solo_Violin_Caprice_No._24_in_A_Minor_-_N._Paganini_Op._1_No._24.mid', 'Concerto_in_A_minor_A_Vivaldi.mid', 'Czardas.mid', 'Fur_Elise.mid', 'The_Swan_Violin____C._Saint-Saens.mid', 'Hungarian_dance_No_5.mid', 'Swan_lake.mid', 'Solo_Violin_Sonata_No._1_in_G_Minor_-_J._S._Bach_BWV_1001.mid', 'Bach_Cello_Suite_No._1_in_G_Major_BWV_1007_Prelude_for_Violin.mid', 'Danse_Macabre.mid', 'Ode_to_Joy_-_Violin.mid', 'Spring-Four_seasons_vivaldi.mid', 'Solo_Violin_Partita_No._2_in_D_Minor_-_J._S._Bach_BWV_1004.mid', 'Nocturne_No._20_in_C_minor_for_Violin.mid', 'Paganiniana_-_Nathan_Milstein.mid', 'Meditation_from_Thais.mid', 'Nocturne_Op._9_No._2_for_Violin_Sarasate.mid']
['Canon_in_D__Violin_Solo_.mp3', 'Bach_Cello_Suite_No._1_For_Violin.mp3', 'Summer_-_Third_movement.mp3', 'Solo_Violin_Caprice_No._24_in_A_Minor_-_N._Paganini_Op._1_No._24.mp3', 'Concerto_in_A_minor_A_Vivaldi.mp3', 'Czardas.mp3

In [None]:
# Expose the loaded songs under the names the dataset cell uses
midi_data, audio_data = midi_fin, mp3_fin

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
class Encoder(nn.Module):
    """Embedding + LSTM encoder for sequences of integer token ids.

    NOTE: a later cell defines another ``Encoder`` (LSTM only, no embedding).
    When the notebook runs top to bottom that later class replaces this one.

    Args:
        input_size: Number of rows of the embedding table (vocabulary size).
        embedding_size: Width of each embedding vector.
        hidden_size: Width of the LSTM hidden and cell states.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Embedding layer
        self.embedding = nn.Embedding(input_size, embedding_size)

        # LSTM layer (batch_first: inputs are (batch, seq, feature))
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, batch_first=True)

    def forward(self, input):
        """Encode ``input`` (integer ids, (batch, seq)) and return ``(hidden, cell)``."""
        # nn.Embedding looks rows up by index, so it needs integer ids;
        # passing a float tensor raises inside the embedding lookup.
        embedded = self.embedding(input.long())
        _, (hidden, cell) = self.lstm(embedded)
        return hidden, cell

In [None]:
# ChatGPT Debugged
class Encoder(nn.Module):
    """LSTM encoder that consumes feature vectors directly (no embedding).

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden and cell states.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size

        # batch_first=True: inputs are laid out as (batch, seq, feature)
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )

    def forward(self, input):
        """Return the final ``(hidden, cell)`` states after reading ``input``."""
        # Cast first so integer-typed inputs are accepted by the LSTM.
        _, final_state = self.lstm(input.float())
        hidden, cell = final_state
        return hidden, cell

In [None]:
class Decoder(nn.Module):
    """LSTM decoder followed by a linear read-out.

    Args:
        hidden_size: Width of the LSTM input as well as of its hidden state.
        output_size: Number of values predicted per time step.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, hidden_size, output_size, num_layers):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size

        # The LSTM both consumes and produces vectors of width hidden_size
        self.lstm = nn.LSTM(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )

        # Projects every LSTM output onto the prediction
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, input, hidden, cell):
        """Advance the decoder and return ``(prediction, hidden, cell)``."""
        state = (hidden, cell)
        features, (next_hidden, next_cell) = self.lstm(input.float(), state)
        return self.fc(features), next_hidden, next_cell

In [None]:
class Seq2Seq(nn.Module):
    """Encoder/decoder wrapper that generates the target one time step at a time.

    The decoder is expected to expose ``lstm`` (an ``nn.LSTM``) and ``fc`` (an
    ``nn.Linear``), as the ``Decoder`` class of this notebook does. Its
    per-step prediction (``fc.out_features`` wide) is fed back as the next
    input; when that width differs from what the decoder LSTM consumes
    (``lstm.input_size``), a learned linear projection bridges the two.

    Args:
        encoder: Module returning ``(hidden, cell)`` for a source batch.
        decoder: Module called as ``decoder(input, hidden, cell)`` and
            returning ``(prediction, hidden, cell)``.
    """

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

        # Width the decoder LSTM consumes vs. width of what is fed back to it.
        self.decoder_input_size = decoder.lstm.input_size
        feedback_size = decoder.fc.out_features
        if feedback_size == self.decoder_input_size:
            self.feedback_proj = nn.Identity()
        else:
            self.feedback_proj = nn.Linear(feedback_size, self.decoder_input_size)

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Generate one prediction per target time step.

        Args:
            src: Source batch, passed to the encoder.
            trg: Target batch, ``(batch, time)`` or ``(batch, time, features)``;
                a 2D target is treated as having a single feature.
            teacher_forcing_ratio: Probability, per step, of feeding the ground
                truth instead of the model's own prediction back to the decoder.

        Returns:
            Tensor of shape ``(batch, time, decoder.fc.out_features)``.
        """
        # Work on the device the model lives on rather than a notebook global.
        first_param = next(self.parameters(), None)
        run_device = first_param.device if first_param is not None else src.device
        src = src.to(run_device)
        trg = trg.to(run_device).float()

        # (batch, time) -> (batch, time, 1) so every step can be sliced uniformly
        if trg.dim() == 2:
            trg = trg.unsqueeze(-1)

        batch_size = src.shape[0]
        max_len = trg.shape[1]

        # Encode the source sequence; its final state seeds the decoder
        decoder_hidden, decoder_cell = self.encoder(src)

        # All-zero start input (replace with a start token if applicable)
        decoder_input = torch.zeros(batch_size, 1, self.decoder_input_size, device=run_device)

        step_outputs = []
        for t in range(max_len):
            decoder_output, decoder_hidden, decoder_cell = self.decoder(
                decoder_input, decoder_hidden, decoder_cell
            )
            step_outputs.append(decoder_output)

            # Outputs are continuous values, so the prediction itself (not an
            # argmax index) is what gets fed back when not teacher forcing.
            teacher_force = random.random() < teacher_forcing_ratio
            feedback = trg[:, t:t + 1, :] if teacher_force else decoder_output
            decoder_input = self.feedback_proj(feedback)

        if not step_outputs:
            return torch.zeros(batch_size, 0, self.decoder.fc.out_features, device=run_device)
        return torch.cat(step_outputs, dim=1)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class SongDataset(Dataset):
    """Pairs the note list of each song with its audio waveform.

    Args:
        midi_data: Indexable collection with one note list per song.
        audio_data: Indexable collection with one waveform per song, in the
            same order as ``midi_data``.
    """

    def __init__(self, midi_data, audio_data):
        self.midi_data, self.audio_data = midi_data, audio_data

    def __len__(self):
        # The MIDI collection defines how many songs there are.
        return len(self.midi_data)

    def __getitem__(self, idx):
        """Return ``(midi_sequence, audio_sequence)`` for song ``idx`` as float32 tensors."""
        return tuple(
            torch.tensor(source[idx], dtype=torch.float32)
            for source in (self.midi_data, self.audio_data)
        )

# midi_data and audio_data are expected to be parallel lists, one entry per song
dataset = SongDataset(midi_data=midi_data, audio_data=audio_data)
dataloader = DataLoader(dataset=dataset, batch_size=1, shuffle=True)

# Sanity check: show the tensor shapes of the first batch only
for batch in dataloader:
    midi_sequence, audio_sequence = batch[0], batch[1]
    print(f"midi_sequence shape: {midi_sequence.shape}")
    print(f"audio_sequence shape: {audio_sequence.shape}")
    break  # Remove this break to check all batches


midi_sequence shape: torch.Size([1, 181, 4])
audio_sequence shape: torch.Size([1, 2467584])


In [None]:
def train(model, data_loader, optimizer, criterion):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: Module called as ``model(inputs, targets)``.
        data_loader: Iterable of ``(inputs, targets)`` batches.
        optimizer: Optimizer stepping the model's parameters.
        criterion: Loss called as ``criterion(outputs, targets)``.

    Returns:
        The average of the per-batch losses, or ``0.0`` if there were no batches.
    """
    model.train()

    # Batches come from the loader on the CPU; move them to wherever the model is.
    first_param = next(model.parameters(), None)

    total_loss = 0.0
    num_batches = 0
    for inputs, targets in data_loader:
        if first_param is not None:
            inputs = inputs.to(first_param.device)
            targets = targets.to(first_param.device)

        optimizer.zero_grad()
        outputs = model(inputs, targets)

        if outputs.numel() == targets.numel():
            # Regression: give the targets the exact shape of the outputs so the
            # loss compares element by element instead of silently broadcasting
            # (N, 1) against (N,) into an (N, N) matrix.
            loss = criterion(outputs, targets.reshape(outputs.shape).to(outputs.dtype))
        else:
            # One score per class and time step against flat class indices.
            loss = criterion(outputs.reshape(-1, outputs.shape[-1]), targets.reshape(-1))

        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1

    return total_loss / num_batches if num_batches else 0.0

In [None]:
# Model and training hyperparameters shared by the cells that build and train the model.
input_size = 4  # Features per note: starting time, duration, pitch, velocity
# embedding_size = 64  (only needed by the embedding-based Encoder variant)
hidden_size = 256  # Width of the LSTM hidden/cell states in encoder and decoder
num_layers = 2  # Stacked LSTM layers in encoder and decoder
output_size = 1  # Mono audio output: one value per time step
learning_rate = 0.001  # Step size handed to the optimizer
batch_size = 1 # One song per batch so back propagation runs after each song (NOTE: the DataLoader cell hard-codes batch_size=1 and does not read this value)

In [None]:
import torch.optim as optim
import torch.nn as nn
import matplotlib.pyplot as plt

# Build the two halves of the network and wire them into the Seq2Seq model
encoder = Encoder(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
)  # no embedding_size: the encoder consumes the note features directly
decoder = Decoder(
    hidden_size=hidden_size,
    output_size=output_size,
    num_layers=num_layers,
)
model = Seq2Seq(encoder, decoder).to(device)

# MSE because the model predicts continuous audio values
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train for a fixed number of epochs, recording the mean loss of each one
num_epochs = 14  # Set this to however many epochs you want to train for
epoch_losses = []

for epoch in range(num_epochs):
    epoch_loss = train(model, dataloader, optimizer, criterion)
    epoch_losses.append(epoch_loss)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')

# Loss curve, one point per epoch
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(range(1, num_epochs + 1), epoch_losses, marker='o', linestyle='-', color='b')
ax.set_title('Training Loss Over Epochs')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.grid(True)
plt.show()

print("Training complete!")


RuntimeError: input.size(-1) must be equal to input_size. Expected 256, got 1

In [None]:
# import torch
# from torch.utils.data import Dataset

# class MIDIAudioDataset(Dataset):
#     def __init__(self, midi_data, audio_data):
#         self.midi_data = midi_data
#         self.audio_data = audio_data

#     def __len__(self):
#         return len(self.midi_data)

#     def __getitem__(self, idx):
#         midi_sequence = torch.tensor(self.midi_data[idx], dtype=torch.float32)
#         audio_sequence = torch.tensor(self.audio_data[idx], dtype=torch.float32)
#         return midi_sequence, audio_sequence

In [None]:
# class MIDI_Audio_Dataset(Dataset):
#     def __init__(self, midi_files, audio_files, sr=22050, n_fft=2048, hop_length=512):
#         self.midi_files = midi_files
#         self.audio_files = audio_files
#         self.sr = sr
#         self.n_fft = n_fft
#         self.hop_length = hop_length

#     def __len__(self):
#         return len(self.midi_files)

#     def __getitem__(self, idx):
#         midi_file = mido.MidiFile(self.midi_files[idx])
#         audio, sr = librosa.load(self.audio_files[idx], sr=self.sr)

#         # MIDI processing
#         midi_data = []
#         time = 0
#         for msg in midi_file.tracks[0]:
#             if msg.type == 'note_on':
#                 midi_data.append([msg.note, msg.velocity, time])
#             elif msg.type == 'note_off':
#                 # Handle note_off events as needed (e.g., calculate duration)
#                 pass
#             time += msg.time

#         # Audio processing
#         spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=n_fft, hop_length=hop_length)
#         spectrogram = librosa.power_to_db(spectrogram, ref=np.max)

#         # Data alignment (placeholder, needs refinement)
#         # Assuming MIDI and audio are synchronized
#         aligned_midi_data = midi_data  # Replace with alignment logic

#         return torch.tensor(aligned_midi_data), torch.tensor(spectrogram)