In [1]:
import warnings

from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers.schedulers.scheduling_utils import SchedulerMixin

warnings.filterwarnings("ignore")

import numpy as np  # noqa: E402
from PIL import Image  # noqa: E402

In [2]:
# librosa is an optional dependency: record whether it could be imported so that
# `Mel.__init__` can raise a helpful error instead of failing at import time.
try:
    import librosa  # noqa: E402

    _librosa_can_be_imported = True
    _import_error = ""
except Exception as e:
    # Deliberately broad: any failure while importing librosa is captured in the
    # message below rather than propagated from this cell.
    _librosa_can_be_imported = False
    _import_error = (
        f"Cannot import librosa because {e}. Make sure to correctly install librosa to be able to use it."
    )

In [3]:

class Mel(ConfigMixin, SchedulerMixin):
    """Converts between raw audio and grayscale mel-spectrogram images.

    Parameters:
        x_res (`int`): x resolution of spectrogram (time)
        y_res (`int`): y resolution of spectrogram (frequency bins)
        sample_rate (`int`): sample rate of audio
        n_fft (`int`): FFT window size handed to librosa
        hop_length (`int`): hop length (a higher number is recommended for lower than 256 y_res)
        top_db (`int`): dynamic range in decibels that is mapped onto the 0-255 pixel range
        n_iter (`int`): number of iterations for Griffin-Lim mel inversion

    Raises:
        ValueError: if librosa could not be imported.
    """

    config_name = "mel_config.json"

    @register_to_config
    def __init__(
        self,
        x_res: int = 256,
        y_res: int = 256,
        sample_rate: int = 22050,
        n_fft: int = 2048,
        hop_length: int = 512,
        top_db: int = 80,
        n_iter: int = 32,
    ):
        # Fail fast: nothing in this class works without librosa.
        if not _librosa_can_be_imported:
            raise ValueError(_import_error)

        self.hop_length = hop_length
        self.sr = sample_rate
        self.n_fft = n_fft
        self.top_db = top_db
        self.n_iter = n_iter
        self.set_resolution(x_res, y_res)
        self.audio = None

    def set_resolution(self, x_res: int, y_res: int):
        """Set resolution.

        Also derives the number of mel bins and the number of audio samples per slice.

        Args:
            x_res (`int`): x resolution of spectrogram (time)
            y_res (`int`): y resolution of spectrogram (frequency bins)
        """
        self.x_res = x_res
        self.y_res = y_res
        self.n_mels = self.y_res
        # NOTE(review): the "- 1" presumably makes librosa's centred framing yield exactly
        # x_res frames per slice — confirm against librosa's framing rules.
        self.slice_size = self.x_res * self.hop_length - 1

    def load_audio(self, audio_file: str = None, raw_audio: np.ndarray = None):
        """Load audio.

        `audio_file` takes precedence when both arguments are given.

        Args:
            audio_file (`str`): must be a file on disk due to Librosa limitation or
            raw_audio (`np.ndarray`): audio as numpy array

        Raises:
            ValueError: if neither `audio_file` nor `raw_audio` is provided.
        """
        if audio_file is not None:
            self.audio, _ = librosa.load(audio_file, mono=True, sr=self.sr)
        elif raw_audio is not None:
            self.audio = raw_audio
        else:
            raise ValueError("Either `audio_file` or `raw_audio` must be provided.")

        # Pad with silence if necessary so that at least one slice is available.
        min_length = self.x_res * self.hop_length
        if len(self.audio) < min_length:
            self.audio = np.concatenate([self.audio, np.zeros((min_length - len(self.audio),))])

    def get_number_of_slices(self) -> int:
        """Get number of slices in audio.

        Returns:
            `int`: number of spectrograms audio can be sliced into
        """
        return len(self.audio) // self.slice_size

    def get_audio_slice(self, slice: int = 0) -> np.ndarray:
        """Get slice of audio.

        Args:
            slice (`int`): slice number of audio (out of get_number_of_slices())

        Returns:
            `np.ndarray`: audio as numpy array
        """
        return self.audio[self.slice_size * slice : self.slice_size * (slice + 1)]

    def get_sample_rate(self) -> int:
        """Get sample rate.

        Returns:
            `int`: sample rate of audio
        """
        return self.sr

    def audio_slice_to_image(self, slice: int, ref=np.max) -> Image.Image:
        """Convert slice of audio to spectrogram.

        Args:
            slice (`int`): slice number of audio to convert (out of get_number_of_slices())
            ref: reference value (or callable computing it) forwarded to `librosa.power_to_db`

        Returns:
            `PIL Image`: grayscale image of x_res x y_res
        """
        S = librosa.feature.melspectrogram(
            y=self.get_audio_slice(slice), sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length, n_mels=self.n_mels
        )
        log_S = librosa.power_to_db(S, ref=ref, top_db=self.top_db)
        # Shift by top_db and rescale to 0..255, clip, then add 0.5 so the uint8 cast rounds.
        bytedata = (((log_S + self.top_db) * 255 / self.top_db).clip(0, 255) + 0.5).astype(np.uint8)
        image = Image.fromarray(bytedata)
        return image

    def image_to_audio(self, image: Image.Image) -> np.ndarray:
        """Converts spectrogram to audio.

        Args:
            image (`PIL Image`): x_res x y_res grayscale image; images in any other mode are
                converted to grayscale first

        Returns:
            audio (`np.ndarray`): raw audio
        """
        # The reshape below needs exactly one byte per pixel, which only holds for mode "L".
        if image.mode != "L":
            image = image.convert("L")
        bytedata = np.frombuffer(image.tobytes(), dtype="uint8").reshape((image.height, image.width))
        # Inverse of the pixel scaling used in audio_slice_to_image.
        log_S = bytedata.astype("float") * self.top_db / 255 - self.top_db
        S = librosa.db_to_power(log_S)
        audio = librosa.feature.inverse.mel_to_audio(
            S, sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length, n_iter=self.n_iter
        )
        return audio

In [None]:
# Step 1: Create an instance of Mel with its default configuration.
mel = Mel()

# Step 2: Load the audio file from disk.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
audio_file = "C:/VS code projects/Orpheus-2/audio/nvg.wav"
mel.load_audio(audio_file)

# Step 3: Determine how many full slices the loaded audio contains.
num_slices = mel.get_number_of_slices()

# Step 4: Convert every slice to a spectrogram image.
images = [mel.audio_slice_to_image(slice_index) for slice_index in range(num_slices)]

In [7]:
# Start from an empty list of spectrogram images; a later cell appends to it.
images = []
# Step 1: Create an instance of Mel with its default configuration.
mel = Mel()

In [17]:
# Step 2: Load the audio file from disk.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
audio_file = "C:/VS code projects/Orpheus-2/downloads/001311.mp3"
mel.load_audio(audio_file)

# Step 3: Determine the number of slices and make sure the requested one exists.
num_slices = mel.get_number_of_slices()
slice_index = 3
assert 0 <= slice_index < num_slices, f"slice {slice_index} requested but audio has only {num_slices} slices"

# Step 4: Convert the selected slice to an image and collect it.
# Re-running this cell appends another image, so `images` grows with every run.
img = mel.audio_slice_to_image(slice_index)
images.append(img)

In [38]:
# NOTE(review): relies on kernel state — `images` must already hold at least 10 entries,
# otherwise this raises IndexError. The preceding cell appends only one image per run.
images[9].save("C:/VS code projects/Orpheus-2/downloads/9.png")

In [53]:
from scipy.io import wavfile
import soundfile as sf


In [57]:
# Use the sample rate the Mel instance was configured with, so the exported audio
# cannot drift from the rate used to build the spectrograms.
sample_rate = mel.get_sample_rate()

# Invert the spectrogram image back into a waveform.
# NOTE(review): relies on kernel state — `images` must already hold at least 4 entries.
audio = mel.image_to_audio(images[3])

# Save the audio array as a temporary WAV file
temp_wav_file = "temp.wav"
wavfile.write(temp_wav_file, sample_rate, audio)

# Set the output MP3 file path
output_mp3_file = "audio/output.mp3"

# Load the temporary WAV file
wav_data, sr = sf.read(temp_wav_file)

# Convert the WAV data to MP3 format, using the rate stored in the WAV file
sf.write(output_mp3_file, wav_data, sr, format="MP3")



In [1]:
pip install psycopg2

Note: you may need to restart the kernel to use updated packages.


In [51]:
import os
from io import BytesIO

import psycopg2

# Connect to the PostgreSQL database.
# The password is taken from the PGPASSWORD environment variable; the literal fallback only
# keeps the existing local setup working and should be removed before sharing this notebook.
conn = psycopg2.connect(
    database="orpheus",
    user="postgres",
    password=os.environ.get("PGPASSWORD", "1234"),
    host="localhost",
    port="5432",
)

try:
    # Target table; the INSERT below writes its `song` and `id` columns.
    # NOTE(review): `song` is presumably a bytea column — confirm against the schema.
    table_name = "songs"
    song_id = 10

    # Re-encode the image on disk as PNG bytes held in memory.
    image = Image.open("C:/VS code projects/Orpheus-2/downloads/0.png")  # Replace with your actual Image object
    png_buffer = BytesIO()
    image.save(png_buffer, format="PNG")
    image_bytes = png_buffer.getvalue()

    # `with conn` commits on success and rolls back on error; the cursor is closed on exit.
    # table_name is a constant defined above, so interpolating it into the SQL text is safe.
    with conn, conn.cursor() as cur:
        cur.execute(f"INSERT INTO {table_name} (song,id) VALUES (%s,%s)", (image_bytes, song_id))
finally:
    # Always release the connection, even when the insert fails.
    conn.close()


In [20]:
import os
from io import BytesIO

import psycopg2

# Connect to the PostgreSQL database.
# The password is taken from the PGPASSWORD environment variable; the literal fallback only
# keeps the existing local setup working and should be removed before sharing this notebook.
conn = psycopg2.connect(
    database="orpheus",
    user="postgres",
    password=os.environ.get("PGPASSWORD", "1234"),
    host="localhost",
    port="5432",
)

# NOTE(review): this reads column `songs` from table `music`, filtered on `encoding`, while the
# earlier insert cell writes `song`/`id` into table `songs` — confirm which schema is correct.
table_name = "music"
image_id = 18  # Replace with the actual ID of the image you want to retrieve

try:
    # Retrieve the image data from the database; the cursor is closed when the block exits.
    # table_name is a constant defined above, so interpolating it into the SQL text is safe.
    with conn.cursor() as cur:
        cur.execute(f"SELECT songs FROM {table_name} WHERE encoding = %s", (image_id,))
        result = cur.fetchone()
finally:
    # Always release the connection, even when the query fails.
    conn.close()

# fetchone() returns None when no row matches; report that explicitly.
if result is None:
    raise LookupError(f"No row in {table_name!r} with encoding = {image_id}")

# Convert the stored bytes to a PIL.Image.Image object
image_bytes = BytesIO(result[0])
image = Image.open(image_bytes)

# Use the image as needed
image.show()
