In [1]:
# Voice-To-Voice AI machine learning model by Faith Villarreal and Ricky Zapata


In [2]:
# Here is the starter code that I made.


In [3]:
import librosa
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib
import matplotlib.pyplot as plt
import librosa.display

# Function to extract features from an audio file
def extract_features(audio_file):
    """Build a fixed-length feature vector describing one audio file.

    The file is loaded with librosa at a 44.1 kHz sample rate. Thirteen MFCCs
    and a mel spectrogram are computed, and each is averaged along axis 1, so
    every coefficient / mel band contributes a single number regardless of
    how long the clip is.

    Args:
        audio_file: Path of the audio file to load.

    Returns:
        A NumPy array holding the 13 MFCC means followed by the mel-band means.
    """
    signal, rate = librosa.load(audio_file, sr=44100)

    # Collapse axis 1 of each representation: one mean per coefficient / band.
    # Note the mel spectrogram is averaged, not flattened.
    mfcc_means = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=13).mean(axis=1)
    mel_means = librosa.feature.melspectrogram(y=signal, sr=rate).mean(axis=1)

    # MFCC means come first, mel-band means second.
    return np.concatenate((mfcc_means, mel_means))

# Function to load dataset and extract features
def load_dataset(dataset_path):
    """Extract features for every audio file in a class-per-folder dataset.

    Expected layout: ``dataset_path/class_name/audio_files``. The name of each
    class folder is used as the label of the files it contains.

    Entries that cannot be part of that layout are skipped instead of
    crashing the load: hidden entries (names starting with ".", such as
    ``.DS_Store`` or ``.ipynb_checkpoints``), plain files sitting next to the
    class folders, and sub-directories inside a class folder. Folders and
    files are visited in sorted order so the returned arrays (and any seeded
    split made from them) are the same on every run and every machine.

    Args:
        dataset_path: Root directory of the dataset.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays: ``X`` holds one feature vector per
        audio file and ``y`` the matching class-folder names. Both are empty
        when no files are found.

    Raises:
        OSError: If ``dataset_path`` cannot be listed (e.g. it does not exist).
        Exception: Whatever ``extract_features`` raises for an unreadable
            file is propagated unchanged.
    """
    X = []
    y = []

    for class_name in sorted(os.listdir(dataset_path)):
        class_folder = os.path.join(dataset_path, class_name)
        # Only real, non-hidden directories are class folders.
        if class_name.startswith('.') or not os.path.isdir(class_folder):
            continue

        for audio_file in sorted(os.listdir(class_folder)):
            file_path = os.path.join(class_folder, audio_file)
            # Ignore OS/editor metadata files and nested directories.
            if audio_file.startswith('.') or not os.path.isfile(file_path):
                continue

            X.append(extract_features(file_path))
            y.append(class_name)  # The folder name is the label

    return np.array(X), np.array(y)

# Train the model
def train_model(X_train, y_train, n_estimators=100, random_state=42):
    """Fit a random-forest classifier on the training data.

    Args:
        X_train: Training feature vectors, one row per sample.
        y_train: Training labels, aligned with ``X_train``.
        n_estimators: Number of trees in the forest. Defaults to 100.
        random_state: Seed passed to the classifier so that training is
            repeatable. Defaults to 42.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    clf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    clf.fit(X_train, y_train)
    return clf

# Predict using the trained model
def predict_vocal_chain(model, audio_file):
    """Return the model's predicted label for a single audio file.

    Args:
        model: Object exposing ``predict``; it is called with a one-element
            list containing the file's feature vector.
        audio_file: Path of the audio file to classify.

    Returns:
        The first (and only) element of the model's prediction.
    """
    # The model is given a batch, so wrap the single vector in a list
    # and unwrap the single answer afterwards.
    single_sample_batch = [extract_features(audio_file)]
    return model.predict(single_sample_batch)[0]

# Visualize the Spectrogram for a given audio file
def visualize_spectrogram(audio_file):
    """Plot the mel spectrogram (in dB) of an audio file.

    The file is loaded at 44.1 kHz, converted to a mel power spectrogram,
    scaled to decibels relative to its maximum, and shown with a colorbar.

    Args:
        audio_file: Path of the audio file to display.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    audio_data, sample_rate = librosa.load(audio_file, sr=44100)
    spectrogram = librosa.feature.melspectrogram(y=audio_data, sr=sample_rate)
    spectrogram_db = librosa.power_to_db(spectrogram, ref=np.max)

    fig, ax = plt.subplots(figsize=(10, 4))
    # specshow assumes sr=22050 unless told otherwise; pass the real rate so
    # the time and mel-frequency axes are labeled correctly for 44.1 kHz audio.
    img = librosa.display.specshow(
        spectrogram_db,
        sr=sample_rate,
        x_axis='time',
        y_axis='mel',
        ax=ax,
    )
    ax.set(title='Mel spectrogram')
    fig.colorbar(img, ax=ax, format='%+2.0f dB')
    plt.show()

if __name__ == "__main__":
    # Load the dataset (replace with the actual path to your dataset).
    dataset_path = 'path_to_dataset'

    # Fail fast with an actionable message when the placeholder path was not
    # replaced, instead of a bare error from deep inside os.listdir.
    if not os.path.isdir(dataset_path):
        raise FileNotFoundError(
            f"Dataset directory not found: {dataset_path!r}. Set dataset_path to a "
            "folder laid out as 'dataset_path/class_name/audio_files'."
        )

    X, y = load_dataset(dataset_path)
    if len(X) == 0:
        raise ValueError(f"No audio files were found under {dataset_path!r}.")

    # Hold out 20% of the samples for testing; the seed keeps the split repeatable.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the model
    print("Training the model...")
    model = train_model(X_train, y_train)

    # Save the model to a file so it can be reused without retraining
    joblib.dump(model, 'vocal_chain_model.pkl')
    print("Model training complete and saved as 'vocal_chain_model.pkl'.")

    # Evaluate on the held-out samples
    print("Evaluating the model...")
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Test Accuracy: {accuracy * 100:.2f}%")

    # Prompt the user for audio files to classify until they type 'exit'
    while True:
        audio_file = input("Enter the path to an audio file for vocal chain prediction (or type 'exit' to quit): ")
        # Tolerate surrounding whitespace and quotes (common when a path is
        # pasted or dragged into the prompt).
        audio_file = audio_file.strip().strip('"\'')
        if not audio_file:
            continue
        if audio_file.lower() == 'exit':
            break

        try:
            # Predict the vocal chain component(s)
            prediction = predict_vocal_chain(model, audio_file)
            print(f"Predicted vocal chain component: {prediction}")

            # Optionally visualize the spectrogram
            visualize_spectrogram(audio_file)

        except Exception as e:
            # Deliberately broad: one bad file should be reported, not end the
            # interactive session.
            print(f"Error processing file: {e}")


FileNotFoundError: [Errno 2] No such file or directory: 'path_to_dataset'