In [1]:
!pip install resampy


[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: C:\Users\Shyamsundhar\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip




In [22]:
import numpy as np
import librosa
import os
import sounddevice as sd
import scipy.io.wavfile as wav
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

# The twelve Tamil uyir (vowel) letters. A letter's position in this list is
# used as its integer class label, so the order must not change.
uyir_letters = [
    'அ', 'ஆ', 'இ', 'ஈ', 'உ', 'ஊ',
    'எ', 'ஏ', 'ஐ', 'ஒ', 'ஓ', 'ஔ',
]

# Function to extract features from an audio file
def extract_features(file_path, max_pad_len=174, n_mfcc=40):
    """Load an audio file and return a fixed-width MFCC matrix.

    Args:
        file_path: Path of the audio file, read with ``librosa.load``.
        max_pad_len: Number of time frames in the returned matrix. Shorter
            clips are zero-padded on the right; longer clips are truncated.
        n_mfcc: Number of MFCC coefficients (rows) to compute. Defaults to
            40, the value that used to be hard-coded.

    Returns:
        A NumPy array with ``n_mfcc`` rows and ``max_pad_len`` columns.
    """
    # 'kaiser_fast' selects the resampler librosa uses while loading the file.
    audio, sample_rate = librosa.load(file_path, res_type='kaiser_fast')
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)

    missing_frames = max_pad_len - mfccs.shape[1]
    if missing_frames > 0:
        # Pad the time axis only; the coefficient axis is left untouched.
        mfccs = np.pad(mfccs, pad_width=((0, 0), (0, missing_frames)), mode='constant')
    else:
        mfccs = mfccs[:, :max_pad_len]
    return mfccs

# Function to load dataset
def load_dataset(dataset_path):
    """Build the MFCC feature array and one-hot labels for the dataset.

    ``dataset_path`` must contain one sub-folder per entry of
    ``uyir_letters``, named after the letter and holding that letter's
    ``.wav`` recordings.

    Args:
        dataset_path: Root directory of the dataset.

    Returns:
        A ``(features, labels)`` tuple. ``features`` stacks the output of
        ``extract_features`` for every recording; ``labels`` is the matching
        one-hot matrix with ``len(uyir_letters)`` columns.

    Raises:
        FileNotFoundError: If a letter's sub-folder does not exist.
        ValueError: If no ``.wav`` file is found under ``dataset_path``.
    """
    features = []
    labels = []

    # enumerate() yields the class index directly, avoiding a list search per file.
    for label_index, letter in enumerate(uyir_letters):
        letter_folder = os.path.join(dataset_path, letter)
        # Sorted so that sample order does not depend on the file system.
        for file_name in sorted(os.listdir(letter_folder)):
            # Case-insensitive match so that '.WAV' recordings are not skipped.
            if not file_name.lower().endswith('.wav'):
                continue
            features.append(extract_features(os.path.join(letter_folder, file_name)))
            labels.append(label_index)

    if not features:
        raise ValueError(f"No .wav files found under {dataset_path!r}")

    features = np.array(features)
    labels = to_categorical(np.array(labels), num_classes=len(uyir_letters))

    return features, labels

# Function to create the neural network
def create_model(input_shape, num_classes):
    """Assemble and compile the fully connected classifier.

    Layer stack: Dense(256, relu) -> Dropout(0.5) -> Dense(128, relu) ->
    Dropout(0.5) -> Dense(num_classes, softmax).

    Args:
        input_shape: Shape tuple of one input sample, passed to the first
            Dense layer.
        num_classes: Number of units in the softmax output layer.

    Returns:
        The ``Sequential`` model, compiled with the Adam optimizer,
        categorical cross-entropy loss and the accuracy metric.
    """
    layer_stack = [
        Dense(256, input_shape=input_shape, activation='relu'),
        Dropout(0.5),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax'),
    ]
    network = Sequential(layer_stack)
    network.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

# Function to train the model
def train_model(model, X_train, y_train, X_val, y_val, epochs=50, batch_size=32):
    """Fit ``model`` with early stopping on the validation loss.

    The EarlyStopping callback monitors ``val_loss`` with a patience of 5
    epochs and is configured to restore the best weights.

    Args:
        model: Object exposing a Keras-style ``fit`` method.
        X_train, y_train: Training inputs and targets.
        X_val, y_val: Validation inputs and targets.
        epochs: Upper bound on the number of training epochs.
        batch_size: Number of samples per gradient update.

    Returns:
        The same ``model`` object that was passed in, after fitting.
    """
    stop_when_stalled = EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True,
    )
    fit_options = {
        'validation_data': (X_val, y_val),
        'epochs': epochs,
        'batch_size': batch_size,
        'callbacks': [stop_when_stalled],
    }
    model.fit(X_train, y_train, **fit_options)
    return model

# Function to record a voice and save as wav
def record_voice(filename, duration=3, fs=16000):
    """Record a single-channel clip and write it to a WAV file.

    Args:
        filename: Destination path handed to ``wav.write``.
        duration: Length of the recording in seconds.
        fs: Sampling rate, used for both the capture and the written file.
    """
    print("Recording...")
    total_frames = int(duration * fs)
    captured = sd.rec(total_frames, samplerate=fs, channels=1)
    # sd.rec returns immediately; block here until the capture has completed.
    sd.wait()
    wav.write(filename, fs, captured)
    print("Recording finished")

# Function to recognize Tamil Uyir letter from recorded voice
def recognize_uyir_letter(model, filename):
    """Predict which uyir letter is spoken in an audio file.

    Args:
        model: Object exposing ``predict``; it receives one flattened
            feature row.
        filename: Path of the audio file to classify.

    Returns:
        The entry of ``uyir_letters`` at the index of the highest score.
    """
    # A one-row batch holding the flattened MFCC matrix.
    batch = extract_features(filename).reshape(1, -1)
    scores = model.predict(batch)
    return uyir_letters[np.argmax(scores)]


# ---- Training pipeline ----
# Both names point at the dataset root; only dataset_path is read below.
file_path = dataset_path = "./audio_dataset"

# Load every recording, then hold out 20% of the samples for validation.
X, y = load_dataset(dataset_path)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Dense layers expect one vector per sample, so collapse each MFCC matrix.
X_train = X_train.reshape(X_train.shape[0], -1)
X_val = X_val.reshape(X_val.shape[0], -1)

input_shape = X_train.shape[1:]
model = create_model(input_shape, len(uyir_letters))

# Fit the network, then write it to disk.
model = train_model(model, X_train, y_train, X_val, y_val)
model.save('uyir_letters_model.h5')

# Recording and recognition are not run here; they need a trained model.


Epoch 1/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 29ms/step - accuracy: 0.1239 - loss: 142.4839 - val_accuracy: 0.1231 - val_loss: 19.2155
Epoch 2/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.1228 - loss: 33.9660 - val_accuracy: 0.1231 - val_loss: 3.2170
Epoch 3/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 34ms/step - accuracy: 0.0814 - loss: 4.4861 - val_accuracy: 0.0923 - val_loss: 3.4731
Epoch 4/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 0.0935 - loss: 4.3389 - val_accuracy: 0.1385 - val_loss: 2.5253
Epoch 5/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 0.0810 - loss: 2.9375 - val_accuracy: 0.1154 - val_loss: 2.9272
Epoch 6/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 0.0855 - loss: 3.0236 - val_accuracy: 0.1308 - val_loss: 2.3938
Epoch 7/50
[1m17/17[0m [32m



In [27]:
from keras.models import load_model

# Restore the classifier from the file written by model.save().
model = load_model('uyir_letters_model.h5')

# Record a 3-second clip, then classify it.
voice_filename = 'recorded_voice.wav'
record_voice(filename=voice_filename, duration=3)
recognized_letter = recognize_uyir_letter(model=model, filename=voice_filename)

print(f"Recognized Tamil Uyir letter: {recognized_letter}")




Recording...
Recording finished
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
Recognized Tamil Uyir letter: அ
