In [36]:
import os
import numpy as np
import librosa
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
import resampy

In [37]:
data_dir = "dataset/Sinhala _words/"
class_labels = os.listdir(data_dir)
num_classes = len(class_labels)
print(num_classes)
print(class_labels)

32
['aba', 'aliya', 'amma', 'ara', 'ata', 'balanna', 'balla', 'basaya', 'bathala', 'dara', 'gaga', 'gala', 'gasa', 'hada', 'jalaya', 'jambu', 'kaju', 'kalaya', 'kana', 'kata', 'kathura', 'lamaya', 'mal', 'mala', 'nasaya', 'nayaa', 'pahana', 'pata', 'takarama', 'tayaraya', 'Tharuwa', 'yathura']


In [38]:
def extract_features(file_path, mfcc=True, chroma=True, mel=True):
    audio, sample_rate = librosa.load(file_path, res_type='kaiser_fast')
    features = []
    if mfcc:
        mfccs = np.mean(librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=13), axis=1)
        features.extend(mfccs)
    if chroma:
        chroma = np.mean(librosa.feature.chroma_stft(y=audio, sr=sample_rate), axis=1)
        features.extend(chroma)
    if mel:
        mel = np.mean(librosa.feature.melspectrogram(y=audio, sr=sample_rate), axis=1)
        features.extend(mel)
    return features

In [39]:
X, y = [], []
for label in class_labels:
    label_dir = os.path.join(data_dir, label)
    for filename in os.listdir(label_dir):
        file_path = os.path.join(label_dir, filename)
        features = extract_features(file_path)
        X.append(features)
        y.append(class_labels.index(label))

X = np.array(X)
y = np.array(y)

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [43]:
# Step 4: Build the Model
model = keras.Sequential([
    keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(num_classes, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Step 5: Model Training
model.fit(X_train, y_train, epochs=50, validation_data=(X_val, y_val))

# Step 6: Model Evaluation
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test accuracy: {test_acc}")

Epoch 1/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 239ms/step - accuracy: 0.0275 - loss: 21.3830 - val_accuracy: 0.0606 - val_loss: 16.3154
Epoch 2/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.0459 - loss: 11.3330 - val_accuracy: 0.0606 - val_loss: 9.6584
Epoch 3/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.1127 - loss: 8.4317 - val_accuracy: 0.0000e+00 - val_loss: 5.8669
Epoch 4/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.1139 - loss: 5.8656 - val_accuracy: 0.0000e+00 - val_loss: 6.9456
Epoch 5/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.1541 - loss: 4.5070 - val_accuracy: 0.0909 - val_loss: 7.9131
Epoch 6/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.1785 - loss: 4.0134 - val_accuracy: 0.0909 - val_loss: 7.1266
Epoch 7/50
[1m5/5[0m [32m━━━━━━

In [41]:
new_audio_features = extract_features('sample2.wav')
new_audio_features = np.array(new_audio_features).reshape(1, -1)
prediction = model.predict(new_audio_features)
predicted_class = class_labels[np.argmax(prediction)]
print(f"Predicted class: {predicted_class}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 231ms/step
Predicted class: aba


In [45]:
from pydub import AudioSegment

# Load the full audio file
full_audio = AudioSegment.from_file("gasaata.wav")

# Define segment duration (in milliseconds)
segment_duration = 1000  # Adjust as needed

# Initialize variables to store segment features and predictions
segment_features_list = []
segment_predictions = []

# Segment the full audio and make predictions
for start_time in range(0, len(full_audio), segment_duration):
    end_time = start_time + segment_duration
    audio_segment = full_audio[start_time:end_time]

    # Convert the audio segment to a file (adjust format and path as needed)
    segment_path = "temp_segment.wav"
    audio_segment.export(segment_path, format="wav")

    # Extract features from the segment
    segment_features = extract_features(segment_path)

    # Preprocess features (e.g., reshape or normalize)
    segment_features = np.array(segment_features).reshape(1, -1)

    # Make predictions using your trained model
    prediction = model.predict(segment_features)
    predicted_class = class_labels[np.argmax(prediction)]

    # Store the features and predictions
    segment_features_list.append(segment_features)
    segment_predictions.append(predicted_class)

    # Optional: Remove the temporary segment file
    os.remove(segment_path)

# Now you have segment features and predictions in the order of the segments
print("Segment Predictions:", segment_predictions)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Segment Predictions: ['balanna', 'ara', 'gaga']
