In [32]:
from pydub import AudioSegment
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [33]:
def mp3_to_vectors(mp3_path, segment_length_ms=3000):
    # Load the MP3 file
    audio = AudioSegment.from_mp3(mp3_path)

    # Split the audio into one-minute segments
    segments = [audio[i:i+segment_length_ms] for i in range(0, len(audio), segment_length_ms)]

    # Process each segment into vectors
    vectors = []
    for segment in segments[:-1]:
        # Export segment to a temporary WAV file
        segment.export("temp.wav", format="wav")

        # Load the segment using librosa
        y, sr = librosa.load("temp.wav", sr=None)

        # Extract features (e.g., MFCCs)
        mfcc = librosa.feature.mfcc(y=y, sr=sr)

        # Flatten the mfccs
        mfcc = mfcc.flatten()
        
        # Append the features to the vectors list
        vectors.append(mfcc)

    return np.array(vectors)


In [34]:
# Loading data from mp3 files

def create_dataset(speaker_path, nonspeaker_path):

    vectors_class_0 = mp3_to_vectors(nonspeaker_path)
    vectors_class_1 = mp3_to_vectors(speaker_path)

    labels_class_0 = np.zeros(len(vectors_class_0))
    labels_class_1 = np.ones(len(vectors_class_1))

    vectors_unshuffled = np.concatenate((vectors_class_0, vectors_class_1))
    labels_unshuffled = np.concatenate((labels_class_0, labels_class_1))

    indices = np.arange(len(vectors_unshuffled))
    np.random.shuffle(indices)
    vectors = vectors_unshuffled[indices]
    labels = labels_unshuffled[indices]

    return (vectors, labels)

In [35]:
def model_and_accuracy(vectors, labels):
    X_train, X_test, y_train, y_test = train_test_split(vectors, labels, test_size = 0.2, random_state = 23)
    classifier = SVC()
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    return (classifier, accuracy)

In [42]:
# Experiment 1 
# With the dummy dataset (both same classes)
speaker_path = "recording_test.mp3"
nonspeaker_path = "recording_test.mp3"

vectors, labels = create_dataset(speaker_path, nonspeaker_path)
model, accuracy = model_and_accuracy(vectors, labels)

print(accuracy)


0.375


In [44]:
# Experiment 2
# With 2 obviously different songs
speaker_path = "celticwoman.mp3"
nonspeaker_path = "gladyoureviltoo.mp3"

vectors, labels = create_dataset(speaker_path, nonspeaker_path)
model, accuracy = model_and_accuracy(vectors, labels)

print(accuracy)

1.0


In [51]:
# Experiment 3
# Two audio clips of the same song, one recorded on speaker, one recorded without speaker

speaker_path = "with_speaker_youtube_video.mp3"
nonspeaker_path = "no_speaker_youtube_video.mp3"

vectors, labels = create_dataset(speaker_path, nonspeaker_path)
model, accuracy = model_and_accuracy(vectors, labels)

print(accuracy)

0.75
