In [2]:
import os
import json
import torch
import librosa
import numpy as np
from torch.utils.data import Dataset
import textgrid

In [3]:
# Feature-extraction hyperparameters; they are the default arguments of
# MusicSegmentationDataset.__init__.
SAMPLING_RATE = 22_050  # `sr` given to librosa.load when reading each song
NO_MFCC = 13            # `n_mfcc` given to librosa.feature.mfcc
HOP_LENGTH = 512        # `hop_length` shared by the MFCC, chroma and mel-spectrogram calls
DURATION = 30           # `duration` given to librosa.load (caps how much audio is read)

In [None]:
import os
import json
import textgrid




def convert_textgrid_to_json(textgrid_file, json_file):
    """Convert one TextGrid annotation file into a JSON file.

    The JSON document has the form
    ``{"tiers": [{"name": ..., "intervals": [{"xmin", "xmax", "text"}, ...]}, ...]}``.

    Args:
        textgrid_file: Path of the TextGrid file to read.
        json_file: Path of the JSON file to write (overwritten if present).

    NOTE(review): the calls below (``TextGrid.load``, ``interval.xmin`` /
    ``xmax`` / ``text``) are kept exactly as used elsewhere in this notebook.
    The PyPI ``textgrid`` package appears to expose ``TextGrid.fromFile`` and
    ``minTime`` / ``maxTime`` / ``mark`` instead -- confirm against the
    installed library.
    """
    textgrid_object = textgrid.TextGrid.load(textgrid_file)

    json_object = {
        "tiers": [
            {
                "name": tier.name,
                "intervals": [
                    {
                        "xmin": interval.xmin,
                        "xmax": interval.xmax,
                        "text": interval.text,
                    }
                    for interval in tier.intervals
                ],
            }
            for tier in textgrid_object.tiers
        ]
    }

    with open(json_file, "w") as f:
        json.dump(json_object, f, indent=4)


def convert_textgrid_dir(textgrid_dir, json_dir):
    """Convert every ``*.TextGrid`` file in ``textgrid_dir`` to JSON in ``json_dir``.

    Args:
        textgrid_dir: Directory containing the TextGrid files.
        json_dir: Output directory; created (with parents) if it does not exist.

    Returns:
        The list of JSON paths written, in sorted file-name order.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
    os.makedirs(json_dir, exist_ok=True)

    written = []
    for textgrid_file in sorted(os.listdir(textgrid_dir)):
        textgrid_file_path = os.path.join(textgrid_dir, textgrid_file)
        stem, extension = os.path.splitext(textgrid_file)

        # Skip sub-directories and stray files (e.g. .DS_Store) that the
        # TextGrid parser cannot read; the extension match is case-insensitive.
        if extension.lower() != ".textgrid" or not os.path.isfile(textgrid_file_path):
            continue

        json_file_path = os.path.join(json_dir, stem + ".json")
        convert_textgrid_to_json(textgrid_file_path, json_file_path)
        written.append(json_file_path)

    return written


if __name__ == "__main__":
    # Placeholder locations: point these at the real annotation folders before running.
    textgrid_dir = "/path/to/textgrid/dir"
    json_dir = "/path/to/json/dir"

    convert_textgrid_dir(textgrid_dir, json_dir)

In [12]:
class MusicSegmentationDataset(Dataset):
    """Dataset pairing each file in ``<root_dir>/songs`` with its segment annotations.

    Each item is a dict with three feature tensors (``'mfcc'``, ``'chroma'``,
    ``'mel_spec'``) and a ``'ground_truth'`` dict read from the matching JSON
    file in ``<root_dir>/metadata``.

    NOTE(review): ``convert_textgrid_to_json`` writes ``{"tiers": [...]}``
    whereas ``_load_ground_truth`` reads top-level ``'pallavi'``,
    ``'anupallavi'``, ``'charanam'`` and ``'bgm'`` keys, so the converter's
    output cannot be loaded as-is. Confirm which tool produces the metadata.
    """

    def __init__(self, root_dir, sr=SAMPLING_RATE, n_mfcc=NO_MFCC, hop_length=HOP_LENGTH, duration=DURATION):
        """Index the songs under ``root_dir`` and store the extraction settings.

        Args:
            root_dir: Directory containing the ``songs`` and ``metadata`` folders.
            sr: Sample rate passed to ``librosa.load``.
            n_mfcc: Number of MFCC coefficients to extract.
            hop_length: Hop length used by every feature extractor.
            duration: ``duration`` argument passed to ``librosa.load``.

        Raises:
            OSError: If ``<root_dir>/songs`` cannot be listed.
        """
        self.root_dir = root_dir
        # os.listdir() returns entries in arbitrary order; sorting makes the
        # index -> song mapping reproducible across runs and machines.
        self.song_files = sorted(os.listdir(os.path.join(self.root_dir, 'songs')))
        self.metadata_dir = os.path.join(self.root_dir, 'metadata')
        self.sr = sr
        self.n_mfcc = n_mfcc
        self.hop_length = hop_length
        self.duration = duration

    def convert_textgrid_to_json(self,textgrid_file, json_file):
        """Write the tiers and intervals of a TextGrid file to ``json_file``.

        Args:
            textgrid_file: Path of the TextGrid file to read.
            json_file: Path of the JSON file to write (overwritten if present).

        NOTE(review): ``TextGrid.load`` and ``interval.xmin`` / ``xmax`` /
        ``text`` are kept as written; the PyPI ``textgrid`` package appears to
        expose ``TextGrid.fromFile`` and ``minTime`` / ``maxTime`` / ``mark``
        instead -- confirm against the installed library.
        """
        textgrid_object = textgrid.TextGrid.load(textgrid_file)

        json_object = {
            "tiers": []
        }

        for tier in textgrid_object.tiers:
            json_tier = {
                "name": tier.name,
                "intervals": []
            }

            for interval in tier.intervals:
                json_interval = {
                    "xmin": interval.xmin,
                    "xmax": interval.xmax,
                    "text": interval.text
                }

                json_tier["intervals"].append(json_interval)

            json_object["tiers"].append(json_tier)

        with open(json_file, "w") as f:
            json.dump(json_object, f, indent=4)

    def _find_metadata_file(self, song_name):
        """Return the path of the metadata JSON for ``song_name``, or None.

        Two names are tried, in order: ``<song_name>.json`` (the historical
        lookup) and ``<song_name without its extension>.json``. The second is
        needed because ``__getitem__`` passes the audio file name, extension
        included, which otherwise yields a lookup such as ``song.mp3.json``.
        """
        candidates = [f"{song_name}.json"]
        stem = os.path.splitext(song_name)[0]
        if stem != song_name:
            candidates.append(f"{stem}.json")

        for candidate in candidates:
            metadata_file = os.path.join(self.metadata_dir, candidate)
            if os.path.exists(metadata_file):
                return metadata_file
        return None

    def _load_ground_truth(self, song_name):
        """Load the segment annotations for one song.

        Args:
            song_name: Song identifier, with or without the audio extension.

        Returns:
            A dict mapping ``'pallavi'``, ``'anupallavi'``, ``'charanam'`` and
            ``'bgm'`` to tensors built from the JSON values, or ``None`` when
            no metadata file exists for the song.

        Raises:
            KeyError: If the metadata file lacks one of the four keys.
        """
        metadata_file = self._find_metadata_file(song_name)
        if metadata_file is None:
            return None

        with open(metadata_file, 'r') as json_file:
            metadata = json.load(json_file)
        return {
            'pallavi': torch.tensor(metadata['pallavi']),
            'anupallavi': torch.tensor(metadata['anupallavi']),
            'charanam': torch.tensor(metadata['charanam']),
            'bgm': torch.tensor(metadata['bgm'])
        }

    def _extract_features(self, music_file):
        """Compute MFCC, chroma and mel-spectrogram features for one audio file.

        Args:
            music_file: Path of the audio file to load.

        Returns:
            A ``(mfcc, chroma, mel_spec)`` tuple of float32 tensors, each the
            transpose of the array librosa returns (presumably giving
            ``(frames, features)`` -- confirm against the librosa version used).
        """
        y, sr = librosa.load(music_file, sr=self.sr, duration=self.duration)

        # Extract MFCCs
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=self.n_mfcc, hop_length=self.hop_length)

        # Extract chroma features
        chroma = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=self.hop_length)

        # Extract mel-spectrogram features
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=self.hop_length)

        return torch.tensor(mfcc.T, dtype=torch.float32), \
               torch.tensor(chroma.T, dtype=torch.float32), \
               torch.tensor(mel_spec.T, dtype=torch.float32)

    def __len__(self):
        """Return the number of entries found in the ``songs`` directory."""
        return len(self.song_files)

    def __getitem__(self, idx):
        """Return the features and annotations of the song at position ``idx``.

        Returns:
            A dict with keys ``'mfcc'``, ``'chroma'``, ``'mel_spec'`` and
            ``'ground_truth'``, or ``None`` when the song has no metadata file.
            NOTE(review): a ``None`` item needs a custom ``collate_fn`` if this
            dataset is wrapped in a ``DataLoader`` -- confirm intended usage.
        """
        song_name = self.song_files[idx]
        music_file = os.path.join(self.root_dir, 'songs', song_name)
        ground_truth = self._load_ground_truth(song_name)

        # Skip the (expensive) audio decoding when there is nothing to train against.
        if ground_truth is None:
            return None

        mfcc, chroma, mel_spec = self._extract_features(music_file)
        return {
            'mfcc': mfcc,
            'chroma': chroma,
            'mel_spec': mel_spec,
            'ground_truth': ground_truth
        }


In [13]:
# Index the songs found under the notebook's current working directory
# (the constructor lists `<cwd>/songs`).
dataset = MusicSegmentationDataset(os.getcwd())

In [19]:
# Smoke check: look up the annotations of the first indexed song.
dataset._load_ground_truth(song_name=dataset.song_files[0])

/workspace/Desktop/Final/Final-Year-Project-MIR/metadata/27_Anbendra Mazhaiyilae - Minsara Kanavu  Anuradha Sriram  A.R. Rahman.mp3.json


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class MusicSegmentationModel(nn.Module):
    """CNN + LSTM classifier producing one score vector per input example.

    A three-stage CNN extracts local patterns from a single-channel feature
    map, an LSTM runs along axis 2 of that map, and a linear layer classifies
    the LSTM's final hidden output.

    Input:  ``(batch, 1, steps, input_size)`` -- axis 2 is used as the sequence
            axis (assumed to be time frames -- confirm against the data
            pipeline) and needs at least 8 steps to survive the pooling.
    Output: ``(batch, num_classes)`` unnormalised class scores.
    """

    # Three 2x2 max-pool layers shrink both spatial axes by 2 ** 3.
    _POOL_FACTOR = 8
    # Number of channels produced by the last convolution.
    _CNN_CHANNELS = 128

    def __init__(self, input_size, hidden_size, num_classes):
        """Build the network.

        Args:
            input_size: Size of the last axis of the input (features per step).
            hidden_size: Hidden size of the LSTM.
            num_classes: Number of output classes.

        Raises:
            ValueError: If ``input_size`` is too small to survive the pooling.
        """
        super().__init__()

        pooled_features = input_size // self._POOL_FACTOR
        if pooled_features < 1:
            raise ValueError(
                f"input_size must be at least {self._POOL_FACTOR}, got {input_size}"
            )

        # CNN layers for feature extraction
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=(3, 3), padding=(1, 1)),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 2)),
            nn.Conv2d(32, 64, kernel_size=(3, 3), padding=(1, 1)),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 2)),
            nn.Conv2d(64, self._CNN_CHANNELS, kernel_size=(3, 3), padding=(1, 1)),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 2))
        )

        # LSTM layers for sequence modeling. Each step sees every CNN channel
        # at every remaining feature position, hence channels * pooled features.
        self.lstm = nn.LSTM(
            input_size=self._CNN_CHANNELS * pooled_features,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
            dropout=0.2,
        )

        # Fully connected layer for classification
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Classify a batch of feature maps.

        Args:
            x: Tensor of shape ``(batch, 1, steps, input_size)``.

        Returns:
            Tensor of shape ``(batch, num_classes)``.
        """
        # Apply CNN to input features -> (batch, channels, steps // 8, input_size // 8)
        x = self.cnn(x)

        # Reshape for LSTM: a batch_first LSTM expects (batch, sequence, features),
        # so move the step axis forward and merge channels with the feature axis.
        # Flattening without the permute would make the LSTM iterate over channels.
        x = x.permute(0, 2, 1, 3).flatten(start_dim=2)

        # Apply LSTM
        lstm_out, _ = self.lstm(x)

        # Keep only the output of the last step (one prediction per example)
        lstm_out = lstm_out[:, -1, :]

        # Apply fully connected layer for classification
        output = self.fc(lstm_out)

        return output

# Example configuration -- tune these to the features being fed to the model.
input_size = 128  # size of the input feature axis
hidden_size = 64  # width of the LSTM hidden state
num_classes = 4   # one class each for pallavi, anupallavi, charanam and bgm

model = MusicSegmentationModel(
    input_size=input_size,
    hidden_size=hidden_size,
    num_classes=num_classes,
)

# Training objective and the optimizer that updates the model's parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Data preprocessing (illustrative only).
# Segment annotations are expected to look like this, with each segment type
# mapped to a list of (start, end) pairs:
#
#     labels = {
#         'pallavi':    [(start1, end1), (start2, end2), ...],
#         'anupallavi': [(start1, end1), (start2, end2), ...],
#         'charanam':   [(start1, end1), (start2, end2), ...],
#         'bgm':        [(start1, end1), (start2, end2), ...],
#     }
#
# They must be converted to targets the loss function accepts (e.g. integer
# class labels) before training; that conversion is not implemented here.


def train_model(model, criterion, optimizer, input_data, targets, num_epochs):
    """Run a plain supervised training loop.

    Args:
        model: Module called as ``model(inputs)``.
        criterion: Loss called as ``criterion(outputs, target)``.
        optimizer: Optimizer built over ``model``'s parameters.
        input_data: Re-iterable collection (e.g. a list) of input batches.
        targets: Re-iterable collection of targets, aligned with ``input_data``.
        num_epochs: Number of passes over the data.

    Returns:
        A list with the mean loss of each epoch (``nan`` for an epoch that saw
        no batches).
    """
    epoch_losses = []
    model.train()

    for _ in range(num_epochs):
        total_loss = 0.0
        num_batches = 0

        # Pair every input batch with its own target; zipping against the
        # annotation dict itself would only yield its string keys.
        for inputs, segment_labels in zip(input_data, targets):
            optimizer.zero_grad()
            outputs = model(inputs)

            loss = criterion(outputs, segment_labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        epoch_losses.append(total_loss / num_batches if num_batches else float("nan"))

    return epoch_losses


# Once `input_data` and `targets` have been prepared:
#     losses = train_model(model, criterion, optimizer, input_data, targets, num_epochs=10)
