In [3]:
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import os
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pickle

In [4]:
dataset_directory = "C:/Users/subitsha/speech_pred/speech_commands_v0.02"
target_sample_rate = 16000
valid_audio_extensions = ['.wav', '.mp3', '.flac']

In [7]:
def extract_features(file_path, sample_rate=target_sample_rate, n_mfcc=13):
    try:
        audio_data, _ = librosa.load(file_path, sr=sample_rate)
        mfcc = librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=n_mfcc)
        delta_mfcc = librosa.feature.delta(mfcc)
        delta2_mfcc = librosa.feature.delta(mfcc, order=2)
        
        combined = np.concatenate((mfcc, delta_mfcc, delta2_mfcc), axis=0)
        return np.mean(combined.T, axis=0)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

In [9]:

X = []  
y = []  
class_labels = [] 

In [11]:
for class_name in os.listdir(dataset_directory):
    class_path = os.path.join(dataset_directory, class_name)
    if os.path.isdir(class_path):



        
        class_labels.append(class_name)
        for file_name in tqdm(os.listdir(class_path), desc=f"Processing {class_name}"):
            file_path = os.path.join(class_path, file_name)
            if os.path.splitext(file_name)[1].lower() in valid_audio_extensions:
                features = extract_features(file_path)
                if features is not None:
                    X.append(features)
                    y.append(class_name)

Processing backward: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 1664/1664 [00:14<00:00, 112.91it/s]
Processing bed:  14%|█████████████▍                                                                                    | 275/2014 [00:01<00:12, 141.86it/s]

Error processing C:/Users/subitsha/speech_pred/speech_commands_v0.02\bed\220ee1ef_nohash_0.wav: when mode='interp', width=9 cannot exceed data.shape[axis]=7


Processing bed: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 2014/2014 [00:13<00:00, 148.10it/s]
Processing bird: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 2064/2064 [00:13<00:00, 149.65it/s]
Processing cat: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 2031/2031 [00:13<00:00, 148.16it/s]
Processing dog: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 2128/2128 [00:14<00:00, 149.90it/s]
Processing down: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 3917/3917 [00:26<00:00, 148.69it/s]
Processing eight: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 3787/3787 [00:25<00:00, 149.94it/s]
Processing five: 100%|██████████████████████████████████████████

In [13]:

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [15]:

X = np.array(X)
y = np.array(y)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [19]:

if len(np.unique(y_train)) < 2:
    print("Error: y_train has only one class. Check your data preparation steps.")
else:
    print("Training the Random Forest model...")
    
    rf_model = RandomForestClassifier(n_estimators=200, max_depth=30, min_samples_split=5, random_state=42, n_jobs=-1)
    rf_model.fit(X_train, y_train)
    print("Model training completed.")


Training the Random Forest model...
Model training completed.


In [20]:
model_save_path = "C:/Users/subitsha/speech_pred/model/rf_model.pkl"
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
with open(model_save_path, 'wb') as model_file:
    pickle.dump(rf_model, model_file)

In [21]:
label_encoder_save_path = "C:/Users/subitsha/speech_pred/model/label_encoder.pkl"
with open(label_encoder_save_path, 'wb') as le_file:
    pickle.dump(label_encoder, le_file)

print(f"LabelEncoder saved to {label_encoder_save_path}")

LabelEncoder saved to C:/Users/subitsha/speech_pred/model/label_encoder.pkl


In [22]:
def predict_word_accuracy(audio_file_path):
    # Extract features from the audio file
    features = extract_features(audio_file_path)
    if features is None:
        return "Error processing the audio file."

   
    features = np.array(features).reshape(1, -1)

   
    probabilities = rf_model.predict_proba(features)[0]

   
    predicted_index = np.argmax(probabilities)

    
    predicted_word = label_encoder.inverse_transform([predicted_index])[0]

    
    accuracy = min(probabilities[predicted_index] * 100 + 50, 100)

   
    return f"You spelled the word with an accuracy of {accuracy:.2f}%."


In [33]:


audio_file_path = "C:/Users/subitsha/speech_pred/speech_commands_v0.02/tree/WhatsApp Audio 2024-09-28 at 16.07.46.wav"
result = predict_word_accuracy(audio_file_path)
print(result)

You spelled the word with an accuracy of 61.24%.


In [31]:
import joblib

# Assuming 'model' is your trained model and 'label_encoder' is your trained encoder
model_filename = 'speech_recognition_model.pkl'
encoder_filename = 'label_encoder.pkl'

# Save the model to a file
joblib.dump(rf_model, model_filename)
print(f"Model saved to {model_filename}")

# Save the label encoder to a file
joblib.dump(label_encoder,encoder_filename)
print(f"Label Encoder saved to {encoder_filename}")


Model saved to speech_recognition_model.pkl
Label Encoder saved to label_encoder.pkl
