In [1]:
import os
from basic_pitch.inference import predict_and_save
from basic_pitch import ICASSP_2022_MODEL_PATH
from miditok import REMI, TokenizerConfig

list_paths = [os.path.join("data/labels", file) for file in os.listdir("data/labels/")]

predict_and_save(
    audio_path_list=list_paths,
    output_directory="out/",
    save_midi=True,
    sonify_midi=False,
    save_model_outputs=False,
    save_notes=False,
    model_or_model_path=ICASSP_2022_MODEL_PATH
)




Predicting MIDI for data/labels/Akuma no ko  attack on titan lofi remix.mp3...


  Creating midi...
  💅 Saved to out/Akuma no ko  attack on titan lofi remix_basic_pitch.mid

Predicting MIDI for data/labels/Circles.mp3...


  Creating midi...
  💅 Saved to out/Circles_basic_pitch.mid

Predicting MIDI for data/labels/Roxanne.mp3...


  Creating midi...
  💅 Saved to out/Roxanne_basic_pitch.mid

Predicting MIDI for data/labels/A thousand years.mp3...


  Creating midi...
  💅 Saved to out/A thousand years_basic_pitch.mid

Predicting MIDI for data/labels/Save your tears  lofi.mp3...


  Creating midi...
  💅 Saved to out/Save your tears  lofi_basic_pitch.mid

Predicting MIDI for data/labels/Strangers  lofi version.mp3...


  Creating midi...
  💅 Saved to out/Strangers  lofi version_basic_pitch.mid

Predicting MIDI for data/labels/Killing me softly with his song.mp3...


  Creating midi...
  💅 Saved to out/Killing me softly with his song_basic_pitch.mid

Predicting MIDI for data/labels/Gangsta

In [9]:
import yaml
with open('config.yaml', 'r') as ymlfile:
    cfg = yaml.safe_load(ymlfile)
cfg["LowDataset"]

{'data_path': 'audio_files.jsonl',
 'dir_inputs': 'data/inputs',
 'dir_labels': 'data/labels',
 'stereo': False,
 'batch_size': 1,
 'masking': False,
 'max_duration': 90,
 'pad_token': 1025,
 'codebook_size': 1027,
 'codebook_num': 8,
 'max_token_len': 7500}

In [1]:
import pickle
import pandas as pd
import torchaudio
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from typing import Tuple

from encodec import EncodecModel
from encodec.utils import convert_audio
from codebooks_patterns import DelayedPatternProvider

from IPython.display import Audio

data = pickle.load(open("data_path.pkl","rb"))
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [6]:
x = torch.tensor([[1,1,1,1], 
                  [2,2,2,2], 
                  [3,3,3,3]])

torch.stack([x] * 3)

tensor([[[1, 1, 1, 1],
         [2, 2, 2, 2],
         [3, 3, 3, 3]],

        [[1, 1, 1, 1],
         [2, 2, 2, 2],
         [3, 3, 3, 3]],

        [[1, 1, 1, 1],
         [2, 2, 2, 2],
         [3, 3, 3, 3]]])

In [2]:
class LowDataset(Dataset):
    def __init__(self, data_path: pd.DataFrame, stereo: bool = False, max_duration : int = 300 ,dir_inputs: str = "data/inputs", dir_labels: str = "data/labels", ):
        super().__init__()
        self.data_path = pickle.load(open(data_path,"rb"))
        
        if stereo:
            self.model = EncodecModel.encodec_model_48khz().to(DEVICE)
        else:
            self.model = EncodecModel.encodec_model_24khz().to(DEVICE)

        # The number of codebooks used will be determined bythe bandwidth selected.
        # E.g. for a bandwidth of 6kbps, `n_q = 8` codebooks are used.
        # Supported bandwidths are 1.5kbps (n_q = 2), 3 kbps (n_q = 4), 6 kbps (n_q = 8) and 12 kbps (n_q =16) and 24kbps (n_q=32).
        # For the 48 kHz model, only 3, 6, 12, and 24 kbps are supported. The number
        # of codebooks for each is half that of the 24 kHz model as the frame rate is twice as much.
        
        self.dir_inputs = dir_inputs
        self.dir_labels = dir_labels
        self.channels = 2 if stereo else 1
        self.sample_rate = 48000 if stereo else 24000
        self.mac_duration = max_duration
        self.model.set_target_bandwidth(6.0)

    def __len__(self) -> int:
        return len(self.data_path)

    def __getitem__(self, idx) -> Tuple[torch.Tensor]:
        input_path = self.data_path.iloc[idx]["input"]
        label_path = self.data_path.iloc[idx]["label"]
        
        input_wav, input_sr = torchaudio.load(f"{self.dir_inputs}/{input_path}")
        label_wav, label_sr = torchaudio.load(f"{self.dir_labels}/{label_path}")
        
        input_wav = convert_audio(input_wav, input_sr, self.sample_rate, self.channels)
        label_wav = convert_audio(label_wav, label_sr, self.sample_rate, self.channels)
        
        input_wav = self._pad_cut(input_wav, self.mac_duration, self.sample_rate)
        label_wav = self._pad_cut(label_wav, self.mac_duration, self.sample_rate)
        
        input_codes = self._get_codes(input_wav)
        label_codes = self._get_codes(label_wav)
        
        input_codes = self._pattern_provider(input_codes)
        label_codes = self._pattern_provider(label_codes)
        
        #input_codes = self._padding_codes(input_codes, 1000)
        #label_codes = self._padding_codes(label_codes, 1000)

        return input_codes, label_codes, label_wav
    
    def _get_codes(self, wav: torch.Tensor) -> torch.Tensor:
        with torch.no_grad():
            frames = self.model.encode(wav.unsqueeze(0).to(DEVICE))
        return torch.cat([encoded[0] for encoded in frames], dim=-1).squeeze(0)
    
    def _pattern_provider(self, wav: torch.Tensor, special_token: int = 1024) -> torch.Tensor:
        K, T = wav.shape
        pattern_provider = DelayedPatternProvider(K)
        partern = pattern_provider.get_pattern(24020)
        values , _ , _ = partern.build_pattern_sequence(wav.unsqueeze(0), special_token=special_token)
        return values.squeeze()
    
    def _padding_codes(self, codes: torch.Tensor, max_len: int, padding_token : int = 1025) -> torch.Tensor:
        K, T = codes.shape
        if T < max_len:
            pad = torch.full((K, max_len - T), padding_token)
            codes = torch.cat([codes, pad], dim=-1)
        return codes
    
    def _pad_cut(self, wav: torch.Tensor, max_len: int, sample_rate: int) -> torch.Tensor:
        if wav.shape[-1] > sample_rate * max_len:
            wav = wav[:, :max_len]
        else:
            pad = torch.zeros((1, (sample_rate * max_len) - wav.shape[-1]))
            wav = torch.cat([wav, pad], dim=-1)
        return wav

In [3]:
# Instantiate a pretrained EnCodec model
# model = EncodecModel.encodec_model_24khz().to(DEVICE)
# model.set_target_bandwidth(6.0)

dataset = LowDataset("data_path.pkl")
wav = dataset.__getitem__(6)
# # Load and pre-process the audio waveform
# wav, sr = torchaudio.load(f"data/inputs/{path}")
# print('Original File ',wav.shape)
# wav = convert_audio(wav, sr, model.sample_rate, model.channels)
# wavpt = wav.unsqueeze(0).to(device)
# print('Modificated File ',wav.shape)



In [7]:
wav[0]

tensor([[1024,   62,   62,  ..., 1024, 1024, 1024],
        [1024, 1024,  913,  ..., 1024, 1024, 1024],
        [1024, 1024, 1024,  ..., 1024, 1024, 1024],
        ...,
        [1024, 1024, 1024,  ..., 1024, 1024, 1024],
        [1024, 1024, 1024,  ..., 1024, 1024, 1024],
        [1024, 1024, 1024,  ..., 1024, 1024, 1024]])

In [5]:
pattern_provider = DelayedPatternProvider(8)
partern = pattern_provider.get_pattern(24020)

In [8]:
partern.revert_pattern_sequence(wav[0].unsqueeze(0).to(DEVICE), special_token=1024)[0]

tensor([[[  62,   62,   62,  ..., 1024, 1024, 1024],
         [ 913,  424,  424,  ..., 1024, 1024, 1024],
         [ 786,  786,  786,  ..., 1024, 1024, 1024],
         ...,
         [1011,  986,  986,  ..., 1024, 1024, 1024],
         [1002, 1002, 1002,  ..., 1024, 1024, 1024],
         [ 948,  975,  948,  ..., 1024, 1024, 1024]]])

In [18]:
torchaudio.save(uri="resampling.mp3", src=decoded[0].detach(), sample_rate=model.sample_rate)

In [5]:
data_path = pickle.load(open("data_path.pkl","rb"))

dir_inputs = "data/inputs"
dir_labels = "data/labels"

input_path = data_path.iloc[6]["input"]

input_wav, input_sr = torchaudio.load(f"{dir_inputs}/{input_path}")

input_wav = convert_audio(input_wav, input_sr, 24000, 1)

model = EncodecModel.encodec_model_24khz().to(DEVICE)
with torch.no_grad():
    frames = model.encode(input_wav.unsqueeze(0).to(DEVICE))



In [23]:
frames[0]

(tensor([[[ 62,  62,  62,  ...,  62,  62,  62],
          [913, 424, 424,  ..., 518, 518, 518],
          [786, 786, 786,  ..., 786, 786, 786],
          ...,
          [639, 639, 639,  ..., 535, 535, 535],
          [597, 534, 610,  ..., 474, 474, 474],
          [611, 676, 683,  ..., 567, 567, 567]]]),
 None)

In [4]:
data = LowDataset(data, device=device)

In [5]:
data.__getitem__(20)



(tensor([[[ 442,  442,  790,  ...,  554,  158,  833],
          [ 826,   40,  989,  ...,  489,  582,  408],
          [ 925,  821,  573,  ...,  657,  113,  531],
          [ 962,    0, 1012,  ...,  732,  798,  882]]]),
 tensor([[[214, 214, 214,  ..., 212, 445, 648],
          [976, 976, 976,  ..., 398,  31,  73],
          [925, 925, 925,  ..., 184, 370, 975],
          [962, 962, 607,  ..., 716, 817, 817]]]))

In [None]:
d = decoded_frames.cpu().detach()
d = d.numpy().squeeze().shape

In [None]:
Audio(d, rate=model.sample_rate)