In [49]:
!pip install SpeechRecognition
!pip install noisereduce
!pip install pyAudioAnalysis
!pip install eyed3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting eyed3
  Downloading eyed3-0.9.7-py3-none-any.whl (246 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m246.1/246.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting filetype<2.0.0,>=1.0.7
  Downloading filetype-1.2.0-py2.py3-none-any.whl (19 kB)
Collecting coverage[toml]<6.0.0,>=5.3.1
  Downloading coverage-5.5-cp39-cp39-manylinux2010_x86_64.whl (243 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m243.5/243.5 kB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting deprecation<3.0.0,>=2.1.0
  Downloading deprecation-2.1.0-py2.py3-n

In [51]:
pip install pydub

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [52]:
import os
import numpy as np
import librosa
import torch
import torch.nn as nn
import torch.optim as optim
import noisereduce as nr
import speech_recognition as sr
from torch.utils.data import Dataset, DataLoader
from musdb import DB
from pyAudioAnalysis import MidTermFeatures as aF
from pyAudioAnalysis import audioTrainTest as aT



In [38]:
# mus = musdb.DB(root="/path/to/musdb18")

In [30]:
# Define the hybrid transformer model
class HybridTransformer(nn.Module):
    """Transformer encoder followed by a linear projection applied at every position.

    Args:
        num_features: Size of the last input dimension (the encoder's ``d_model``).
        num_classes: Size of the last output dimension.
        num_layers: Number of stacked encoder layers.
        num_heads: Number of attention heads; must divide ``num_features``.
            When ``None``, the largest of 8, 4, 2, 1 that divides
            ``num_features`` is used.

    Raises:
        ValueError: If ``num_heads`` is not a positive divisor of ``num_features``.
    """

    def __init__(self, num_features, num_classes, num_layers=2, num_heads=None):
        super().__init__()

        if num_heads is None:
            # 1 always divides num_features, so this search cannot come up empty.
            num_heads = next(h for h in (8, 4, 2, 1) if num_features % h == 0)
        elif num_heads <= 0 or num_features % num_heads != 0:
            raise ValueError(
                f"num_heads ({num_heads}) must be a positive divisor of "
                f"num_features ({num_features})"
            )

        # Encoder-only stack: it maps a single input sequence to a single output
        # sequence. The full nn.Transformer needs both a source and a target
        # sequence and takes (d_model, nhead, num_encoder_layers) positionally,
        # so it cannot be driven by forward(x) alone.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=num_features,
            nhead=num_heads,
            batch_first=True,
        )
        self.transformer_layers = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(num_features, num_classes)

    def forward(self, x):
        """Encode ``x`` and project it to ``num_classes`` values per position.

        Args:
            x: Tensor of shape ``(batch, sequence, num_features)``.

        Returns:
            Tensor of shape ``(batch, sequence, num_classes)``.
        """
        encoded = self.transformer_layers(x)
        return self.fc(encoded)

In [31]:
class SpeechSeparationDataset(Dataset):
    """MUSDB18 tracks exposed as ``(vocals, accompaniment)`` mono waveform pairs.

    Args:
        musdb18_root: Root directory of the MUSDB18 dataset.
        subset: ``'train'`` or ``'valid'`` selects that split of the MUSDB18
            training subset, so validation tracks are never part of the
            training data. Any other value (e.g. ``'test'``) is passed to
            musdb unchanged as the subset name.
    """

    def __init__(self, musdb18_root, subset='train'):
        # The file imports `DB` directly (`from musdb import DB`); the name
        # `musdb` itself is not in scope.
        if subset in ('train', 'valid'):
            # musdb has no 'valid' subset: validation tracks are a split of 'train'.
            self.musdb18 = DB(root=musdb18_root, subsets='train', split=subset)
        else:
            self.musdb18 = DB(root=musdb18_root, subsets=subset)

    def __len__(self):
        """Return the number of tracks in the selected subset/split."""
        return len(self.musdb18)

    def __getitem__(self, idx):
        """Return ``(vocals, accompaniment)`` for track ``idx`` as 1-D float32 arrays."""
        track = self.musdb18[idx]
        vocals = self._to_mono(track.targets['vocals'].audio)
        accompaniment = self._to_mono(track.targets['accompaniment'].audio)
        return vocals, accompaniment

    @staticmethod
    def _to_mono(audio):
        """Average the channels of an in-memory ``(samples, channels)`` array.

        The ``.audio`` attribute is already decoded audio data, not a file path,
        so it must not be handed to ``librosa.load``.
        NOTE(review): assumes musdb returns audio as (samples, channels) - confirm.
        """
        samples = np.asarray(audio, dtype=np.float32)
        if samples.ndim > 1:
            samples = samples.mean(axis=1)
        return samples

In [None]:
import os
import requests
import zipfile

def download_musdb18(download_url, destination_dir, timeout=60):
    """Download a MUSDB18 zip archive and extract it into ``destination_dir``.

    Args:
        download_url: URL of the zip archive.
        destination_dir: Directory that receives the extracted files. It is
            created when it does not exist.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        ``os.path.join(destination_dir, "musdb18")``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
    """
    os.makedirs(destination_dir, exist_ok=True)
    zip_filename = os.path.join(destination_dir, "musdb18.zip")

    try:
        # Download the zip file; the context manager releases the connection.
        with requests.get(download_url, stream=True, timeout=timeout) as response:
            # Fail here instead of saving an HTML error page as "musdb18.zip".
            response.raise_for_status()
            with open(zip_filename, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)

        # Extract the zip file
        with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
            zip_ref.extractall(destination_dir)
    finally:
        # Remove the zip file, including a partial one left by a failed download.
        if os.path.exists(zip_filename):
            os.remove(zip_filename)

    # Return the path to the extracted dataset
    musdb18_root = os.path.join(destination_dir, "musdb18")
    return musdb18_root

# Set the destination directory for the dataset
destination_dir = "/content"

# Download and extract the MUSDB18 dataset
download_url = "https://zenodo.org/record/1117372/files/musdb18.zip?download=1"

# The archive is large, so skip the download when the directory that
# download_musdb18 reports as the dataset root is already there.
# Delete that directory to force a fresh download.
musdb18_root = os.path.join(destination_dir, "musdb18")
if os.path.isdir(musdb18_root):
    print("MUSDB18 dataset already present at:", musdb18_root)
else:
    musdb18_root = download_musdb18(download_url, destination_dir)
    print("MUSDB18 dataset downloaded and extracted to:", musdb18_root)


In [39]:

# Configuration
num_features = 128     # size of the model's input feature dimension
num_classes = 2        # size of the model's output dimension
num_layers = 2         # number of transformer layers
num_epochs = 10        # passes over the training data
batch_size = 4         # tracks per batch
learning_rate = 0.001  # Adam step size

# Keep a dataset root that an earlier cell (the download cell) already defined;
# unconditionally assigning the placeholder here would discard it.
if 'musdb18_root' not in globals():
    musdb18_root = 'path/to/musdb18'  # TODO: replace with the local MUSDB18 directory

In [40]:
# Model and optimizer
# Run on the GPU when torch can see one, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Build the network from the configuration values and move it to the chosen device.
model = HybridTransformer(num_features, num_classes, num_layers)
model = model.to(device)

# Loss and optimiser used by the training loop.
# NOTE(review): BCEWithLogitsLoss is normally used with targets in [0, 1] -
# confirm that matches what the training loop passes as the target.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [41]:
# Data handling
# Training split: reshuffled by the loader on every pass.
train_dataset = SpeechSeparationDataset(musdb18_root=musdb18_root, subset='train')
train_loader = DataLoader(
    dataset=train_dataset,
    shuffle=True,
    batch_size=batch_size,
)

# Validation split: kept in dataset order.
val_dataset = SpeechSeparationDataset(musdb18_root=musdb18_root, subset='valid')
val_loader = DataLoader(
    dataset=val_dataset,
    shuffle=False,
    batch_size=batch_size,
)


ValueError: ignored

In [None]:

# Speech separation and isolation
def separate_and_isolate(vocals, accompaniment):
    """Subtract the accompaniment's STFT from the vocals' STFT and invert the result.

    Args:
        vocals: NumPy array whose last axis is time.
        accompaniment: NumPy array whose last axis is time.

    Returns:
        The inverse STFT of the spectrogram difference, with as many samples
        as the shorter of the two inputs.
    """
    # Trim both signals to their common length: the two spectrograms can only
    # be subtracted when they have the same number of frames.
    n_samples = min(vocals.shape[-1], accompaniment.shape[-1])
    S_vocals = librosa.stft(vocals[..., :n_samples])
    S_accompaniment = librosa.stft(accompaniment[..., :n_samples])

    S_difference = S_vocals - S_accompaniment

    # Pass `length` so the output has exactly n_samples samples rather than a
    # count derived from the number of STFT frames.
    isolated_vocals = librosa.istft(S_difference, length=n_samples)
    return isolated_vocals

In [None]:
# Audio denoising
def denoise_audio(audio, noise_factor=0.1, sample_rate=16000):
    """Reduce noise in ``audio`` with noisereduce, using the signal itself as the noise profile.

    Args:
        audio: Audio samples to denoise.
        noise_factor: Proportion by which the noise is reduced
            (forwarded as ``prop_decrease``).
        sample_rate: Sampling rate of ``audio`` in Hz.

    Returns:
        The value returned by ``nr.reduce_noise``.
    """
    # noisereduce 2.x replaced the 1.x keywords `audio_clip` / `noise_clip`
    # with `y` / `y_noise` and requires the sampling rate. A separate noise
    # signal is only used in stationary mode.
    denoised_audio = nr.reduce_noise(
        y=audio,
        sr=sample_rate,
        y_noise=audio,
        stationary=True,
        prop_decrease=noise_factor,
    )
    return denoised_audio

In [None]:
# Audio editing: example function to apply gain and speed changes
def edit_audio(audio, gain_dB=3.0, speed_factor=1.5):
    """Apply a gain in decibels, then time-stretch the result.

    Args:
        audio: NumPy array of audio samples.
        gain_dB: Gain in decibels; 0 leaves the amplitude unchanged.
        speed_factor: Stretch rate handed to ``librosa.effects.time_stretch``.

    Returns:
        The amplified audio after time stretching.
    """
    # Decibels to a linear amplitude factor. Pre-emphasis is a filter, not a
    # gain, so it cannot be used to amplify by a number of decibels.
    linear_gain = 10.0 ** (gain_dB / 20.0)
    audio_amplified = audio * linear_gain

    # `rate` is passed by keyword: recent librosa releases reject it positionally.
    audio_speed_changed = librosa.effects.time_stretch(audio_amplified, rate=speed_factor)
    return audio_speed_changed

In [None]:
# Speech recognition
def speech_to_text(audio, sample_rate, sample_width, language='en-US'):
    """Transcribe ``audio`` with the Google recogniser of ``speech_recognition``.

    Args:
        audio: Audio samples. Floating-point samples are treated as being in
            [-1, 1] and converted to signed integer PCM of ``sample_width``
            bytes; integer samples are passed through unchanged.
        sample_rate: Sampling rate in Hz.
        sample_width: Bytes per sample.
        language: Language code forwarded to ``recognize_google``.

    Returns:
        The value returned by ``recognize_google``.

    Raises:
        ValueError: If ``audio`` is floating point and ``sample_width`` is not 2 or 4.
    """
    samples = np.asarray(audio)

    if np.issubdtype(samples.dtype, np.floating):
        # The raw bytes of float samples are not PCM data; convert to integers
        # of the requested width first.
        pcm_dtypes = {2: np.int16, 4: np.int32}
        if sample_width not in pcm_dtypes:
            raise ValueError(
                f"cannot convert float audio to {sample_width}-byte PCM; "
                "use a sample_width of 2 or 4"
            )
        pcm_dtype = pcm_dtypes[sample_width]
        full_scale = np.iinfo(pcm_dtype).max
        # Scale in float64 so the int32 full-scale value is represented exactly.
        clipped = np.clip(samples.astype(np.float64), -1.0, 1.0)
        samples = (clipped * full_scale).astype(pcm_dtype)

    recognizer = sr.Recognizer()
    audio_data = sr.AudioData(samples.tobytes(), sample_rate, sample_width)
    text = recognizer.recognize_google(audio_data, language=language)
    return text

In [None]:
# Emotion detection
def detect_emotion(audio, sample_rate, model_name):
    """Classify ``audio`` with a model loaded from ``"<model_name>_model"``.

    Mid-term features are extracted with a 50 ms window / 25 ms step on top of
    short-term frames of 25 ms window / 10 ms step, then handed to the loaded
    model for classification.

    Args:
        audio: Audio signal passed to the feature extractor.
        sample_rate: Sampling rate in Hz, used to convert durations to samples.
        model_name: Prefix of the model path; ``"_model"`` is appended.

    Returns:
        The value returned by ``aT.classify_vector``.
    """
    def seconds_to_samples(seconds):
        # Durations are given in seconds; the extractor receives sample counts.
        return int(seconds * sample_rate)

    mid_window = seconds_to_samples(0.050)
    mid_step = seconds_to_samples(0.025)
    short_window = seconds_to_samples(0.025)
    short_step = seconds_to_samples(0.010)

    # Only the first of the three returned values is used.
    features, _unused_second, _unused_third = aF.mid_feature_extraction(
        audio,
        sample_rate,
        mid_window,
        mid_step,
        short_window,
        short_step,
    )

    # NOTE(review): confirm that `load_model_keras` and `classify_vector` exist
    # in the installed pyAudioAnalysis version.
    model_path = "{}_model".format(model_name)
    classifier = aT.load_model_keras(model_path)

    return aT.classify_vector(features, classifier)


In [42]:
# Training loop
# NOTE(review): the loss compares model(vocals) with accompaniment - confirm
# that the two tensors have matching shapes for the chosen criterion.
for epoch in range(num_epochs):
    # Training pass: one optimiser step per batch.
    model.train()
    for vocals, accompaniment in train_loader:
        optimizer.zero_grad()

        vocals = vocals.to(device)
        accompaniment = accompaniment.to(device)

        output = model(vocals)
        loss = criterion(output, accompaniment)

        loss.backward()
        optimizer.step()

    # Validation pass: accumulate the per-batch loss without tracking gradients.
    model.eval()
    val_loss = 0.0
    num_val_batches = 0
    with torch.no_grad():
        for vocals, accompaniment in val_loader:
            vocals = vocals.to(device)
            accompaniment = accompaniment.to(device)

            output = model(vocals)
            loss = criterion(output, accompaniment)
            val_loss += loss.item()
            num_val_batches += 1

    # An empty validation loader would make the average a division by zero;
    # report NaN rather than crashing or printing a misleading 0.0.
    val_loss = val_loss / num_val_batches if num_val_batches else float('nan')
    print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {val_loss:.4f}')

NameError: ignored

In [None]:
# Use the model for separation and isolation

# Settings that do not change between batches are defined once, up front.
sample_rate = 16000  # Set the appropriate sample rate for your audio
sample_width = 2  # Set the appropriate sample width for your audio (2 bytes for 16-bit audio)
model_name = 'your_model_name'  # Replace this with the name of your trained model or a pre-trained model

model.eval()
with torch.no_grad():
    for vocals, accompaniment in val_loader:
        vocals = vocals.to(device)

        output = model(vocals)
        output = output.cpu().numpy()

        # Process audio
        # NOTE(review): `vocals` and `output` are whole batches here - confirm
        # the helpers below are meant to receive batched arrays.
        isolated_vocals = separate_and_isolate(vocals.cpu().numpy(), output)
        denoised_audio = denoise_audio(isolated_vocals)
        edited_audio = edit_audio(denoised_audio)

        # Recognize speech and emotion
        try:
            text = speech_to_text(edited_audio, sample_rate, sample_width)
        except (sr.UnknownValueError, sr.RequestError) as exc:
            # One unintelligible clip or failed request should not abort the
            # remaining batches.
            print("Speech recognition failed:", repr(exc))
            text = None
        emotion = detect_emotion(edited_audio, sample_rate, model_name)

        print("Transcript:", text)
        print("Emotion:", emotion)