<a href="https://colab.research.google.com/github/Ogweno/portfolio/blob/main/naiive_feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
import os
import librosa
import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine
from tabulate import tabulate


In [None]:
# Mount Google Drive at /content/drive; the audio folder read by the later
# cells lives under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


The cell below implements naive feature extraction for audio files: it uses librosa to compute Mel-frequency cepstral coefficients (MFCCs) and summarizes each file as a fixed-length feature vector.

In [None]:
def extract_features(audio_path):
    """Summarize one audio file as a fixed-length MFCC feature vector.

    The file is loaded at its native sample rate (``sr=None``), MFCCs are
    computed with librosa's default settings, and every coefficient is then
    reduced over time to its mean and its standard deviation.

    Args:
        audio_path: Path to an audio file readable by ``librosa.load``.

    Returns:
        A 1-D NumPy array containing the per-coefficient means followed by
        the per-coefficient standard deviations, or ``None`` if the file
        could not be loaded or processed (the failure is reported on stdout).
    """
    try:
        # Load audio file at its native sample rate (no resampling)
        y, sr = librosa.load(audio_path, sr=None)

        # Extract MFCC features; rows are coefficients, columns are time frames
        mfccs = librosa.feature.mfcc(y=y, sr=sr)

        # Reduce along axis=1 (time) so each coefficient yields one mean and
        # one standard deviation, independent of the clip's duration
        mfccs_mean = np.mean(mfccs, axis=1)
        mfccs_std = np.std(mfccs, axis=1)

        # Concatenate mean and standard deviation to form feature vector
        return np.concatenate((mfccs_mean, mfccs_std))

    except Exception as e:
        # Deliberately best-effort: one unreadable file must not abort a whole
        # directory scan. Report *why* it failed so the problem is diagnosable.
        print(
            "Error encountered while parsing file:",
            audio_path,
            f"({type(e).__name__}: {e})",
        )
        return None

def process_directory(directory):
    """Collect feature vectors for every ``.wav`` file under ``directory``.

    The tree is walked recursively with ``os.walk``. Files for which
    ``extract_features`` returns ``None`` are left out, so the result may be
    shorter than the number of ``.wav`` files found.

    Args:
        directory: Root directory to search.

    Returns:
        A list of feature vectors in the order the files were visited.
    """
    # Lazily enumerate matching paths so extraction follows walk order
    wav_paths = (
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(directory)
        for name in names
        if name.endswith('.wav')
    )
    candidate_vectors = (extract_features(path) for path in wav_paths)
    return [vector for vector in candidate_vectors if vector is not None]



In [None]:
def compute_mfcc(audio_path):
    """Return the MFCC matrix of an audio file after removing its DC offset.

    The file is loaded with ``librosa.load``'s default sample rate (no ``sr``
    argument is passed), the signal's mean is subtracted, and MFCCs are
    computed with librosa's default settings.

    Args:
        audio_path: Path to an audio file readable by ``librosa.load``.

    Returns:
        The array produced by ``librosa.feature.mfcc`` for the centered signal.
    """
    signal, sample_rate = librosa.load(audio_path)
    centered = signal - signal.mean()  # zero-mean the waveform
    return librosa.feature.mfcc(y=centered, sr=sample_rate)

def compute_similarities1(filepaths, targetfilepath):
    """Rank audio files by cosine similarity of their time-averaged MFCCs.

    Each file's MFCC matrix (from ``compute_mfcc``) is averaged along axis 1
    and compared with the target's averaged MFCC vector.

    Args:
        filepaths: Sequence of audio file paths to score.
        targetfilepath: Path of the reference audio file.

    Returns:
        A dict mapping each file's base name to ``1 - cosine distance`` to
        the target, ordered from most to least similar. Paths that share a
        base name collapse into a single entry (the last one wins).
    """
    candidate_mfccs = list(map(compute_mfcc, filepaths))
    target_mfcc = compute_mfcc(targetfilepath)

    candidate_profiles = [np.mean(matrix, axis=1) for matrix in candidate_mfccs]
    target_profile = np.mean(target_mfcc, axis=1)

    # Pair every path with its own profile and score it against the target
    scores = {}
    for path, profile in zip(filepaths, candidate_profiles):
        scores[os.path.basename(path)] = 1 - cosine(target_profile, profile)

    # Highest similarity first
    ranked = sorted(scores.items(), key=lambda entry: entry[1], reverse=True)
    return dict(ranked)




In [None]:
# Specify the directory containing audio files
audio_directory = '/content/drive/MyDrive/2024 - unfoldingWord Hackathon/'

# Get all audio file paths
audio_files = []
for root, _dirs, files in os.walk(audio_directory):
    for file in files:
        if file.endswith('.wav'):
            audio_files.append(os.path.join(root, file))

# Extract features one file at a time and keep each vector paired with its
# file name. Walking the tree once and pairing here guarantees that a file
# which fails to parse (extract_features returns None) cannot shift the
# column headers relative to the data.
named_features = []
for file_path in audio_files:
    feature_vector = extract_features(file_path)
    if feature_vector is not None:
        named_features.append((os.path.basename(file_path), feature_vector))

extracted_features = [vector for _, vector in named_features]

if not named_features:
    print("\nNo features extracted from:", audio_directory)
else:
    # Prepare data for tabulate: one column per file, one row per feature.
    # NOTE: the vectors are NumPy arrays, so `[i] + vector` would broadcast an
    # addition (shifting every value by i) rather than prepend an index.
    # Convert to plain lists instead; tabulate's showindex supplies row numbers.
    header = [name for name, _ in named_features]
    table_data = [vector.tolist() for vector in extracted_features]

    # Transpose the table
    transposed_table_data = list(map(list, zip(*table_data)))

    # Print transposed table
    print("\nExtracted Features (Transposed):")
    print(tabulate(transposed_table_data, headers=header, tablefmt="grid", showindex=True))


Extracted Features (Transposed):
+----+------------------------+------------------------+-----------------------+--------------+--------------+-------------------+-------------+-------------------+------------+-----------------------+-------------+
|    |   InTheBeginning01.wav |   InTheBeginning02.wav |   inTheBeginning2.wav |   jesus1.wav |   jesus2.wav |   wilderness1.wav |    God1.wav |   wilderness2.wav |   God2.wav |   inTheBeginning1.wav |    God3.wav |
|  0 |            -559.004    |            -468.601    |           -420.525    |  -431.576    |  -428.537    |       -433.228    | -512.456    |        -451.253   | -514.064   |           -417.237    | -299.132    |
+----+------------------------+------------------------+-----------------------+--------------+--------------+-------------------+-------------+-------------------+------------+-----------------------+-------------+
|  1 |             109.518    |              74.0679   |             54.1791   |     1.95082  |    21.

In [None]:
# Folder holding the recordings to compare (top level only, no recursion)
folder_path = '/content/drive/MyDrive/2024 - unfoldingWord Hackathon/'

# Full path of every WAV file directly inside the folder
file_paths = [
    os.path.join(folder_path, entry)
    for entry in os.listdir(folder_path)
    if entry.endswith('.wav')
]

# Independent copy of the path list handed to the similarity routine
files = list(file_paths)

# Reference recording that every other file is scored against
target_file_path = '/content/drive/MyDrive/2024 - unfoldingWord Hackathon/God3.wav'  # Example target file path
similarities = compute_similarities1(files, target_file_path)

print(f"\nSimilarities to {target_file_path}:")

# Rows of (filename, similarity), already ordered most-similar first
table_data = list(similarities.items())

# Print table
print(tabulate(table_data, headers=["Filename", "Similarity"], tablefmt="grid"))



Similarities to /content/drive/MyDrive/2024 - unfoldingWord Hackathon/God3.wav:
+----------------------+--------------+
| Filename             |   Similarity |
| God3.wav             |     1        |
+----------------------+--------------+
| inTheBeginning1.wav  |     0.912822 |
+----------------------+--------------+
| wilderness1.wav      |     0.900347 |
+----------------------+--------------+
| InTheBeginning01.wav |     0.899718 |
+----------------------+--------------+
| wilderness2.wav      |     0.899063 |
+----------------------+--------------+
| inTheBeginning2.wav  |     0.895474 |
+----------------------+--------------+
| God1.wav             |     0.889791 |
+----------------------+--------------+
| God2.wav             |     0.884701 |
+----------------------+--------------+
| InTheBeginning02.wav |     0.884566 |
+----------------------+--------------+
| jesus2.wav           |     0.870224 |
+----------------------+--------------+
| jesus1.wav           |     0.855348 |