In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import speech_recognition as sr
from enum import Enum

In [None]:
# Open the Speech Commands dataset through TensorFlow Datasets.
# A list of split names yields the splits as a list in the same order
# (train, validation, test); as_supervised=True makes every element an
# (audio, label) tuple instead of a feature dictionary.
dataset, info = tfds.load(
    name='speech_commands',
    split=['train', 'validation', 'test'],
    as_supervised=True,
    with_info=True,
)
print(info)



In [None]:
def preprocess_data(dataset):
    """Collect the label of every example as both a string and a raw value.

    The audio waveform is not used: the "text" produced for each example is
    simply the string form of its label.

    Args:
        dataset: Iterable of examples. Each example is either an
            ``(audio, label)`` tuple/list (what ``tfds.load`` yields with
            ``as_supervised=True``) or a mapping with a ``'label'`` key. The
            label object must expose ``.numpy()``.

    Returns:
        A ``(texts, labels)`` pair of NumPy arrays of equal length, where
        ``texts[i] == str(labels[i])``.
    """
    texts = []
    labels = []

    for example in dataset:
        # Supervised datasets yield (audio, label) tuples, which cannot be
        # indexed by key; everything else is treated as a feature dictionary.
        if isinstance(example, (tuple, list)):
            label = example[1]
        else:
            label = example['label']

        label_value = label.numpy()
        # Here we use the label as the "language" (i.e., command)
        texts.append(str(label_value))
        labels.append(label_value)

    return np.array(texts), np.array(labels)

In [None]:
# Run the train, validation and test splits (in that order) through
# preprocess_data and unpack each resulting (texts, labels) pair.
(
    (train_texts, train_labels),
    (val_texts, val_labels),
    (test_texts, test_labels),
) = (preprocess_data(split) for split in dataset[:3])

In [None]:
# Bag-of-words features: the vocabulary is learned from the training texts
# only and then reused unchanged for the validation and test texts.
#
# The texts built by preprocess_data are stringified integer labels, and
# CountVectorizer's default token_pattern keeps only tokens of two or more
# characters. That silently drops every single-digit label, leaving those
# rows all-zero. Allow one-character tokens so each label gets a feature.
cv = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
X_train = cv.fit_transform(train_texts)
X_val = cv.transform(val_texts)
X_test = cv.transform(test_texts)


In [None]:
# Train a multinomial Naive Bayes classifier on the training features.
# fit() returns the estimator itself, so construction and training can be
# chained; the bare name keeps the fitted estimator as the cell's output.
model = MultinomialNB().fit(X_train, train_labels)
model


In [None]:
# Score the fitted classifier on the validation features and labels, then
# print the result rounded to two decimals.
# NOTE(review): the vectorised texts are the stringified labels themselves
# (see preprocess_data), so this figure does not measure how well audio or
# transcribed speech is recognised.
accuracy = model.score(X_val, val_labels)
print(f"Model Accuracy: {accuracy:.2f}")

In [None]:
class Language(Enum):
    """Languages that can be requested when transcribing speech."""
    # The member value is the language code handed to the recogniser.
    ENGLISH = "en"

# SpeechToText class for recording live audio input and converting it to text
class SpeechToText:
    """Records one utterance from a microphone and transcribes it to text."""

    @staticmethod
    def speech_to_text(device_index=1, language=Language.ENGLISH,
                       timeout=None, phrase_time_limit=None):
        """Record from a microphone and transcribe with Google Speech Recognition.

        Args:
            device_index: Index of the microphone to open. The default of 1
                is machine specific; NOTE(review): confirm it matches an
                available input device on the target machine.
            language: ``Language`` member; its ``value`` is sent as the
                recognition language and its ``name`` is shown in the output.
            timeout: Optional number of seconds to wait for speech to start.
                ``None`` (default) waits indefinitely, as before.
            phrase_time_limit: Optional maximum number of seconds to record.
                ``None`` (default) records until the speaker pauses.

        Returns:
            The transcribed text, or ``None`` when no speech started before
            ``timeout``, the audio could not be understood, or the
            recognition request failed.
        """
        r = sr.Recognizer()
        with sr.Microphone(device_index=device_index) as source:
            print("Recording...")
            try:
                audio = r.listen(
                    source,
                    timeout=timeout,
                    phrase_time_limit=phrase_time_limit,
                )
            except sr.WaitTimeoutError:
                # Only reachable when a timeout was requested.
                print("No speech detected before the timeout")
                return None
            print("Recording Complete...")

        # The microphone is released at this point: the network round trip
        # below does not need the audio device to stay open.
        try:
            # Transcribe audio to text using Google's Speech Recognition API
            text = r.recognize_google(audio, language=language.value)
        except sr.UnknownValueError:
            print("Could not understand audio")
            return None
        except sr.RequestError as e:
            print(f"Request error from Google Speech Recognition service: {e}")
            return None

        print(f"Transcribed Text ({language.name}):", text)
        return text

In [None]:
def predict_language_from_audio(device_index=1, language=Language.ENGLISH):
    """Record speech, transcribe it and classify the transcription.

    Uses the notebook-level vectoriser ``cv`` and classifier ``model``, which
    must already be fitted when this function is called.

    Args:
        device_index: Microphone index forwarded to
            ``SpeechToText.speech_to_text``.
        language: ``Language`` member forwarded to
            ``SpeechToText.speech_to_text``.

    Returns:
        The array returned by ``model.predict`` for the transcription, or
        ``None`` when nothing was transcribed.
    """
    # Get the audio input and convert it to text
    transcribed_text = SpeechToText.speech_to_text(device_index, language)

    # None signals a failed transcription; an empty string has nothing to
    # classify either.
    if not transcribed_text:
        return None

    # Vectorize the transcribed text. The sparse matrix is passed straight to
    # the classifier; densifying it first is unnecessary.
    data = cv.transform([transcribed_text])

    # Predict the language (command) using the trained model
    output = model.predict(data)
    print("Predicted Language:", output)
    return output


In [None]:
if __name__ == "__main__":
    # Live demo: record from the microphone and print the predicted label.
    # device_index selects which microphone is opened; language is the
    # recognition language used for transcription.
    device_index, language = 1, Language.ENGLISH
    predict_language_from_audio(device_index, language)