In [3]:
import os
import librosa
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import sounddevice as sd
from scipy.io.wavfile import write

def load_audio_files(directory):
    audio_files = []
    for filename in os.listdir(directory):
        if filename.endswith(".mp3") or filename.endswith(".wav"):
            path = os.path.join(directory, filename)
            audio_files.append(path)
    return audio_files

def audio_to_spectrogram(audio_path, max_pad_len=174):
    y, sr = librosa.load(audio_path, duration=30)  # Load only the first 30 seconds
    if len(y) < sr * 30:
        y = np.pad(y, (0, max(sr * 30 - len(y), 0)), mode='constant')
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
    S_dB = librosa.power_to_db(S, ref=np.max)
    if S_dB.shape[1] > max_pad_len:
        S_dB = S_dB[:, :max_pad_len]
    else:
        S_dB = np.pad(S_dB, ((0, 0), (0, max_pad_len - S_dB.shape[1])), 'constant')
    return S_dB

def preprocess_spectrogram(S_dB):
    S_dB = (S_dB - np.min(S_dB)) / (np.max(S_dB) - np.min(S_dB))
    return S_dB.flatten()

def load_and_preprocess_audio(directory):
    audio_files = load_audio_files(directory)
    labels = [os.path.splitext(os.path.basename(file))[0] for file in audio_files]
    spectrograms = [audio_to_spectrogram(file) for file in audio_files]
    preprocessed = [preprocess_spectrogram(spectrogram) for spectrogram in spectrograms]
    return audio_files, labels, preprocessed

def find_closest_match(input_spectrogram, training_spectrograms, labels):
    input_preprocessed = preprocess_spectrogram(input_spectrogram)
    similarities = cosine_similarity([input_preprocessed], training_spectrograms)
    max_index = np.argmax(similarities)
    return labels[max_index], similarities[0][max_index]

def record_audio(duration=10, samplerate=22050):
    print("Press Enter to start recording for {} seconds...".format(duration))
    input("Press Enter to start recording:")  # User prompt to start recording
    recording = sd.rec(int(duration * samplerate), samplerate=samplerate, channels=2)
    sd.wait()  # Wait until recording is finished
    output_filename = "recording.wav"
    write(output_filename, samplerate, recording)  # Save as WAV file
    print("Recording finished.")
    return output_filename

# Main functionality
def main():
    training_directory = "C:\\Users\\Plaksha\\Desktop\\Sem 6\\Deep Learning\\AALets do it viks\\Data"
    training_files, training_labels, training_spectrograms = load_and_preprocess_audio(training_directory)
    
    input_audio_path = record_audio()
    input_spectrogram = audio_to_spectrogram(input_audio_path)
    
    closest_match_label, similarity_score = find_closest_match(input_spectrogram, training_spectrograms, training_labels)
    print(f"The closest match is: {closest_match_label} with a similarity score of {similarity_score:.2f}")

if __name__ == "__main__":
    main()


Press Enter to start recording for 10 seconds...
Press Enter to start recording:
Recording finished.
The closest match is: 000615 with a similarity score of 0.86


In [7]:
import os
import librosa
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
import sounddevice as sd
from scipy.io.wavfile import write
import pandas as pd

def load_audio_files(directory, duration=20, sr=22050):
    labels = []
    spectrograms = []
    for filename in os.listdir(directory):
        if filename.endswith(".mp3") or filename.endswith(".wav"):
            path = os.path.join(directory, filename)
            audio, _ = librosa.load(path, sr=sr, duration=duration)
            audio = librosa.util.fix_length(audio, size=sr*duration)
            S = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128)
            S_dB = librosa.power_to_db(S, ref=np.max)
            spectrograms.append(S_dB)
            labels.append(os.path.splitext(filename)[0])
    return np.array(spectrograms), np.array(labels)

def build_cnn_model(input_shape, num_classes):
    model = models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(128, (3, 3), activation='relu'),
        layers.Flatten(),
        layers.Dropout(0.5),
        layers.Dense(128, activation='relu'),
        layers.Dense(num_classes, activation='softmax')  # Ensure softmax for multi-class classification
    ])
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',  # Correct loss function
                  metrics=['accuracy'])
    return model

def record_audio(filename='temp_recording.wav', duration=20, sr=22050):
    print("Press Enter to start recording for {} seconds...")
    input("Press Enter to start recording:")
    recording = sd.rec(int(duration * sr), samplerate=sr, channels=2)
    sd.wait()  # Wait until recording is finished
    write(filename, sr, recording)  # Save as WAV file
    print("Recording finished.")
    return filename

directory = "C:\\Users\\Plaksha\\Desktop\\Sem 6\\Deep Learning\\AALets do it viks\\Data"
spectrograms, labels = load_audio_files(directory)
labels, uniques = pd.factorize(labels)

# Normalize spectrograms
spectrograms = spectrograms / np.max(spectrograms)

# Prepare data for training
X_train, X_test, y_train, y_test = train_test_split(spectrograms[..., np.newaxis], labels, test_size=0.2, random_state=42)

# Build and train the CNN model
input_shape = X_train.shape[1:]
num_classes = len(np.unique(labels))
model = build_cnn_model(input_shape, num_classes)
model.fit(X_train, y_train, epochs=20, validation_data=(X_test, y_test))

def predict_top_songs(audio_path, top_k=3):
    audio, _ = librosa.load(audio_path, sr=22050, duration=20)
    audio = librosa.util.fix_length(audio, size=22050*20)
    S = librosa.feature.melspectrogram(y=audio, sr=22050, n_mels=128)
    S_dB = librosa.power_to_db(S, ref=np.max)
    S_dB = S_dB[np.newaxis, ..., np.newaxis] / np.max(S_dB)
    predictions = model.predict(S_dB)
    top_indices = np.argsort(predictions[0])[-top_k:][::-1]
    return [uniques[i] for i in top_indices]

# Record and predict
input_audio_path = record_audio()
top_songs = predict_top_songs(input_audio_path)
print("Top Predicted Songs:", top_songs)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Press Enter to start recording for {} seconds...
Press Enter to start recording:
Recording finished.
Top Predicted Songs: ['001039', '001684', '000546']
