<a href="https://colab.research.google.com/github/PratikStar/google-colab/blob/main/All_in_one_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Some notes
Original owner: pratik-sutar@g.ecc.u-tokyo.ac.jp



## Imports & Drive mount

In [13]:
import os
import pickle
import re
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import inspect
import soundfile as sf
import math
import sys
from random import random

# Mount Google Drive so the /content/drive/Shareddrives/... paths used by the
# pipeline resolve. With force_remount=False an already-mounted drive is left as is.
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Pipeline Classes & Code

In [24]:
'''
Get a clip
Randomly select a batch
    - chop into segments
    - Convert to spectrogram
Input to VAE

'''
# Data locations; both live under the Google Drive mount point /content/drive.
# NOTE(review): CLIPS_DIR is not referenced by the code visible in this section.
CLIPS_DIR = "/content/drive/Shareddrives/timbre-space-drive/01-audio-data/01-clips"
# Audio file loaded by run_once() and get_random_batch().
DI_CLIP = "/content/drive/Shareddrives/timbre-space-drive/01-audio-data/DI.wav"
# Clip-level statistics: None until run_once() fills them in.
CLIP_DURATION = None  # duration of DI_CLIP, as returned by librosa.get_duration
CLIP_STFT_MIN = None  # minimum of the whole clip's dB-scaled STFT magnitude
CLIP_STFT_MAX = None  # maximum of the whole clip's dB-scaled STFT magnitude
# Sample rate passed as `sr` to every librosa.load call.
SAMPLE_RATE = 22050
# STFT n_fft; the hop length used everywhere is STFT_FRAMESIZE // 2.
STFT_FRAMESIZE = 512


# Length of one segment; get_random_batch() loads segment_size * batch_size seconds.
segment_size = 0.743 # in sec.
# Number of segments cut from each loaded excerpt.
batch_size = 10


def run_once():
    """Measure clip-level statistics of ``DI_CLIP`` and store them in globals.

    The whole clip is loaded as mono at ``SAMPLE_RATE`` and three module-level
    names are assigned:

    * ``CLIP_DURATION`` -- the clip length reported by ``librosa.get_duration``.
    * ``CLIP_STFT_MIN`` / ``CLIP_STFT_MAX`` -- smallest and largest value of the
      dB-scaled STFT magnitude of the entire clip. ``get_random_batch`` uses
      them to min-max normalise each excerpt.

    Call this before ``get_random_batch``, which reads all three globals.

    Returns:
        None. The results are published through the globals listed above.
    """
    global CLIP_DURATION, CLIP_STFT_MIN, CLIP_STFT_MAX
    signal, sample_rate = librosa.load(DI_CLIP, sr=SAMPLE_RATE, mono=True)
    CLIP_DURATION = librosa.get_duration(y=signal, sr=sample_rate)

    # [:-1] discards the last frequency row of the STFT; get_random_batch
    # applies the same slice, so the statistics match what it normalises.
    stft = librosa.stft(signal,
                        n_fft=STFT_FRAMESIZE,
                        hop_length=STFT_FRAMESIZE // 2)[:-1]
    # Only the magnitude is needed here; the phase was computed but never used.
    log_spectrogram = librosa.amplitude_to_db(np.abs(stft))
    CLIP_STFT_MIN = log_spectrogram.min()
    CLIP_STFT_MAX = log_spectrogram.max()

# Fill CLIP_DURATION / CLIP_STFT_MIN / CLIP_STFT_MAX before any batch is drawn.
run_once()

def get_random_batch():
    """Cut a random excerpt of ``DI_CLIP`` into normalised spectrogram segments.

    An excerpt of ``segment_size * batch_size`` seconds is loaded from a random
    offset inside the clip, converted to a dB-scaled STFT magnitude, min-max
    scaled with ``CLIP_STFT_MIN`` / ``CLIP_STFT_MAX`` (set by ``run_once``) and
    split along the time axis into ``batch_size`` equally long segments.

    Returns:
        numpy.ndarray of shape ``(freq_bins, frames_per_segment, batch_size)``.
        ``batch[:, :, i]`` is the i-th segment and consists of consecutive STFT
        frames. Trailing frames that do not fill a whole segment are dropped.
        ``None`` is returned (after printing an error) when the clip is shorter
        than the requested excerpt.
    """
    load_duration = segment_size * batch_size # in sec
    if load_duration > CLIP_DURATION:
        print("ERROR: Insufficient Clip duration. Reduce the batch_size or segment_size")
        return None
    # random() is in [0, 1), so the excerpt always fits inside the clip.
    load_offset = random() * (CLIP_DURATION - load_duration)
    print(f"Loading clip from offset: {load_offset}s")

    di_signal = librosa.load(DI_CLIP,
                             sr=SAMPLE_RATE,
                             duration=load_duration,
                             offset=load_offset,
                             mono=True)[0]

    # [:-1] discards the last frequency row, matching the slice run_once uses
    # when it measures CLIP_STFT_MIN / CLIP_STFT_MAX.
    stft = librosa.stft(di_signal,
                        n_fft=STFT_FRAMESIZE,
                        hop_length=STFT_FRAMESIZE // 2)[:-1]
    spectrogram = np.abs(stft)
    print(spectrogram.shape)
    log_spectrogram = librosa.amplitude_to_db(spectrogram)
    normalized_spectrogram = (log_spectrogram - CLIP_STFT_MIN) / (CLIP_STFT_MAX - CLIP_STFT_MIN)

    n_bins, n_frames = normalized_spectrogram.shape
    frames_per_segment = n_frames // batch_size
    # Drop leftover frames so the reshape cannot fail when the frame count is
    # not an exact multiple of batch_size.
    usable = normalized_spectrogram[:, :frames_per_segment * batch_size]
    # Reshape to (bins, segment, frame) so every segment holds consecutive
    # frames. Reshaping straight to (bins, frame, segment) would instead put
    # frames b, b + batch_size, b + 2 * batch_size, ... into segment b.
    segments = usable.reshape(n_bins, batch_size, frames_per_segment)
    # Move the segment axis last to keep the (bins, frames, batch) layout that
    # this function has always returned.
    return np.ascontiguousarray(segments.transpose(0, 2, 1))

# Draw one batch; get_random_batch() returns None when the clip is too short.
batch = get_random_batch()


Loading clip from offset: 55.18323405609419s
(256, 640)


In [28]:
# Seed PyTorch's global random number generator with 0.
import torch; torch.manual_seed(0)
import torch.nn as nn
import torch.nn.functional as F
import torch.utils
import torch.distributions
import torchvision
import numpy as np
# Set the default matplotlib figure resolution to 200 dpi.
import matplotlib.pyplot as plt; plt.rcParams['figure.dpi'] = 200
# Use CUDA when PyTorch reports it as available, otherwise the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'


In [29]:
class Encoder(nn.Module):
    """Two-layer fully connected encoder from a flattened input to a latent vector.

    Args:
        latent_dims: Size of the latent vector produced by ``forward``.
        input_dims: Number of values per sample once everything after the
            batch dimension is flattened. Defaults to 784, the size that was
            previously hard-coded.
        hidden_dims: Width of the hidden layer. Defaults to 512, the size that
            was previously hard-coded.
    """

    def __init__(self, latent_dims, input_dims=784, hidden_dims=512):
        super().__init__()
        self.linear1 = nn.Linear(input_dims, hidden_dims)
        self.linear2 = nn.Linear(hidden_dims, latent_dims)

    def forward(self, x):
        """Encode a batch of inputs.

        Args:
            x: Tensor whose first dimension is the batch; all remaining
                dimensions are flattened and must total ``input_dims`` values.

        Returns:
            Tensor of shape ``(batch, latent_dims)``. No activation is applied
            to the output layer.
        """
        x = torch.flatten(x, start_dim=1)
        x = F.relu(self.linear1(x))
        return self.linear2(x)