In [11]:
!pip install librosa tensorflow

Collecting librosa
  Using cached librosa-0.10.2.post1-py3-none-any.whl.metadata (8.6 kB)


ERROR: Could not find a version that satisfies the requirement tensorflow (from versions: none)
ERROR: No matching distribution found for tensorflow


In [2]:
import os
import re

import librosa
import numpy as np
import soundfile as sf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers, models

ModuleNotFoundError: No module named 'librosa'

In [None]:
# Define the CMKD (CNN + Transformer) model
def create_cmkd_model(input_shape, num_classes):
    """Build a two-branch Keras classifier (convolutional + self-attention).

    Both branches read the same input tensor. Their outputs are concatenated
    and passed through a 128-unit ReLU layer and a softmax layer. The fusion
    visible here is a plain concatenation; no separate teacher/student
    distillation loss is defined in this function.

    Args:
        input_shape: Shape of one sample, without the batch axis. The
            Conv2D layers require a 3-D shape; the call site in this
            notebook passes (128, 128, 1).
        num_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled ``models.Model`` mapping the input to class
        probabilities.
    """
    spectrogram_in = layers.Input(shape=input_shape)

    # Convolutional branch: two Conv2D -> MaxPooling2D stages (32 then 64
    # filters), flattened into one vector per sample.
    cnn_branch = spectrogram_in
    for n_filters in (32, 64):
        cnn_branch = layers.Conv2D(
            n_filters, (3, 3), activation='relu', padding='same'
        )(cnn_branch)
        cnn_branch = layers.MaxPooling2D(pool_size=(2, 2))(cnn_branch)
    cnn_branch = layers.Flatten()(cnn_branch)

    # Attention branch: keep the first input axis as the sequence axis and
    # fold the remaining axes into the feature axis, apply self-attention
    # (query and value are the same tensor), then average over the sequence.
    token_sequence = layers.Reshape((input_shape[0], -1))(spectrogram_in)
    attention_branch = layers.MultiHeadAttention(num_heads=4, key_dim=64)(
        token_sequence, token_sequence
    )
    attention_branch = layers.GlobalAveragePooling1D()(attention_branch)

    # Fuse both feature vectors and classify.
    fused = layers.Concatenate()([cnn_branch, attention_branch])
    hidden = layers.Dense(128, activation='relu')(fused)
    class_probs = layers.Dense(num_classes, activation='softmax')(hidden)

    return models.Model(spectrogram_in, class_probs)

In [None]:
# Load audio files and extract features
def load_audio_files(directory, max_length=128, n_mels=128):
    """Load every ``.wav`` file in ``directory`` as a normalised mel-spectrogram.

    Each file is loaded at its native sampling rate (``sr=None``), converted
    to a dB-scaled mel-spectrogram, min-max scaled to [0, 1] and then
    truncated or zero-padded along the time axis to ``max_length`` frames.

    Args:
        directory: Folder to scan (not recursive).
        max_length: Number of time frames in every returned spectrogram.
        n_mels: Number of mel bands (rows) in every returned spectrogram.

    Returns:
        A tuple ``(features, labels)`` of NumPy arrays. ``features`` has one
        ``(n_mels, max_length)`` spectrogram per file; ``labels`` holds the
        part of each file name before the first ``'.'`` (``a.wav`` -> ``'a'``).
        Both arrays are empty when no ``.wav`` file is found.
    """
    labels = []
    features = []

    # os.listdir returns entries in arbitrary order; sort so that the order
    # of features/labels is reproducible across runs and platforms.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".wav"):
            continue

        print("Processing file:", filename)
        file_path = os.path.join(directory, filename)
        audio, sr = librosa.load(file_path, sr=None)

        # Extract Mel-spectrogram
        mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=n_mels)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

        # Min-max normalise. A constant spectrogram (e.g. digital silence)
        # has zero range; map it to all zeros instead of dividing by zero,
        # which would fill the features with NaN.
        lowest = np.min(mel_spec_db)
        value_range = np.max(mel_spec_db) - lowest
        if value_range > 0:
            mel_spec_db = (mel_spec_db - lowest) / value_range
        else:
            mel_spec_db = np.zeros_like(mel_spec_db)

        # Truncate or zero-pad the time axis to exactly max_length frames.
        n_frames = mel_spec_db.shape[1]
        if n_frames > max_length:
            mel_spec_db = mel_spec_db[:, :max_length]
        elif n_frames < max_length:
            padding = np.zeros((mel_spec_db.shape[0], max_length - n_frames))
            mel_spec_db = np.hstack((mel_spec_db, padding))

        # Get label (character) from the filename (e.g., 0.wav -> '0', a.wav -> 'a')
        labels.append(filename.split('.')[0])
        features.append(mel_spec_db)

    return np.array(features), np.array(labels)

In [None]:
# Read the labelled training recordings and turn them into spectrograms
train_data_dir = '/content/drive/MyDrive/Project I/training_data/'
X_train, y_train = load_audio_files(train_data_dir)

# Map the string labels to integer class ids
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# Build the network; one output unit per distinct label seen during fitting
input_shape = (128, 128, 1)  # Adjust based on your spectrogram dimensions
num_classes = len(label_encoder.classes_)
cmkd_model = create_cmkd_model(input_shape, num_classes)

# Integer targets -> sparse categorical cross-entropy
cmkd_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Show the layer-by-layer architecture
cmkd_model.summary()

In [None]:
# Add the trailing channel dimension if it is missing. The ndim guard keeps
# this cell safe to re-run.
if X_train.ndim == 3:
    X_train = X_train[..., np.newaxis]

# Set batch size. steps_per_epoch is kept for reference only: it is the
# number of batches needed to see every sample once (ceiling division).
batch_size = 32
steps_per_epoch = int(np.ceil(len(X_train) / batch_size))

# Train the model. steps_per_epoch is deliberately NOT passed to fit():
# the previous floor division yielded 0 steps for fewer than `batch_size`
# samples and otherwise dropped the last partial batch every epoch. For
# NumPy inputs Keras derives the step count from the data and batch size.
history = cmkd_model.fit(X_train, y_train_encoded, epochs=150, batch_size=batch_size, verbose=1)

In [3]:
# Read one audio file without resampling
def load_audio(file_path):
    """Load ``file_path`` with librosa at the file's native sampling rate.

    Returns:
        A ``(samples, sampling_rate)`` tuple as produced by
        ``librosa.load(..., sr=None)``.
    """
    waveform, native_rate = librosa.load(file_path, sr=None)
    return waveform, native_rate

# Detect regions of continuous sound and pauses between them
def detect_sound_regions(audio, sample_rate, silence_threshold=0.01, min_pause_duration=0.8, hop_length=512):
    """Find stretches of sound separated by sufficiently long pauses.

    The signal is reduced to one RMS value per analysis frame. A frame is
    "silent" when its RMS is below ``silence_threshold``. A sound region
    starts at the first non-silent frame and ends at the first silent frame
    that begins a run of ``min_pause_duration`` seconds of silence (shorter
    gaps stay inside the region).

    Args:
        audio: 1-D array of samples.
        sample_rate: Sampling rate of ``audio`` in Hz.
        silence_threshold: RMS value below which a frame counts as silent.
        min_pause_duration: Minimum pause length, in seconds, that closes a
            region.
        hop_length: Samples between successive RMS frames. Multiply the
            returned frame indices by this value to get sample indices.

    Returns:
        A list of ``(start_frame, end_frame)`` tuples, end exclusive, in
        order of occurrence.
    """
    rms = librosa.feature.rms(y=audio, hop_length=hop_length)[0]
    frame_duration = hop_length / sample_rate
    min_pause_frames = int(min_pause_duration / frame_duration)

    sound_regions = []
    is_silence = rms < silence_threshold

    start = None
    for i in range(len(is_silence)):
        if not is_silence[i] and start is None:
            start = i  # Sound started
        elif is_silence[i] and start is not None:
            # Only close the region if the silence lasts long enough. Near
            # the end of the signal the slice is shorter, so trailing
            # silence of any length closes the region.
            if np.all(is_silence[i:i + min_pause_frames]):
                sound_regions.append((start, i))
                start = None

    # A region that is still open when the signal ends (no trailing silent
    # frame) would otherwise be lost; close it at the last frame.
    if start is not None:
        sound_regions.append((start, len(is_silence)))

    return sound_regions

In [4]:
# Split audio by detected sound regions and save each region as a letter
def split_and_save_letters(audio, sample_rate, sound_regions, output_folder, hop_length=512):
    """Write each detected sound region to its own ``letter<N>.wav`` file.

    Args:
        audio: 1-D array of samples.
        sample_rate: Sampling rate written into every output file.
        sound_regions: Iterable of ``(start_frame, end_frame)`` pairs in RMS
            frame units.
        output_folder: Destination folder; created if it does not exist.
            Existing files in it are not removed, only overwritten when the
            names collide.
        hop_length: Samples per RMS frame. Must match the hop length used
            to produce ``sound_regions``.

    Files are numbered from 1 in the order the regions are given. Writing
    uses ``sf.write`` (the ``soundfile`` package imported as ``sf``).
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    saved = 0
    for file_count, (start_frame, end_frame) in enumerate(sound_regions, start=1):
        # Convert RMS frame indices to sample indices. Slicing clamps an end
        # index that runs past the end of the signal.
        letter_audio = audio[start_frame * hop_length:end_frame * hop_length]
        segment_filename = os.path.join(output_folder, f'letter{file_count}.wav')

        # Save the letter audio as a .wav file
        sf.write(segment_filename, letter_audio, sample_rate)
        saved = file_count

    print(f"Saved {saved} letter files to {output_folder}")

# End-to-end segmentation of one recording into per-letter files
def process_word_audio(input_file_path, output_folder):
    """Load a recording, locate its sound regions and save each as a file.

    Args:
        input_file_path: Path of the audio file to segment.
        output_folder: Folder that receives the per-region ``.wav`` files.
    """
    waveform, rate = load_audio(input_file_path)
    split_and_save_letters(
        waveform,
        rate,
        detect_sound_regions(waveform, rate),
        output_folder,
    )

In [5]:
# Extract mel features from segmented files
def extract_mel_features(file_path, max_length=128, n_mels=128):
    """Return a normalised, fixed-width mel-spectrogram for one audio file.

    The processing mirrors ``load_audio_files`` so that inference features
    match the training features: native sampling rate, dB-scaled
    mel-spectrogram, min-max scaling to [0, 1], then truncation or
    zero-padding of the time axis.

    Args:
        file_path: Audio file to read.
        max_length: Number of time frames in the result.
        n_mels: Number of mel bands (rows) in the result.

    Returns:
        A 2-D array of shape ``(n_mels, max_length)``.
    """
    # sr=None keeps the file's own sampling rate. librosa's default would
    # resample to 22050 Hz, which the training pipeline does not do.
    audio, sample_rate = librosa.load(file_path, sr=None)
    mel_spec = librosa.feature.melspectrogram(y=audio, sr=sample_rate, n_mels=n_mels)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

    # Min-max normalise; a constant spectrogram has zero range, so map it to
    # zeros rather than dividing by zero and producing NaN.
    lowest = np.min(mel_spec_db)
    value_range = np.max(mel_spec_db) - lowest
    if value_range > 0:
        mel_spec_db = (mel_spec_db - lowest) / value_range
    else:
        mel_spec_db = np.zeros_like(mel_spec_db)

    # Truncate or zero-pad the time axis to exactly max_length frames.
    n_frames = mel_spec_db.shape[1]
    if n_frames > max_length:
        mel_spec_db = mel_spec_db[:, :max_length]
    elif n_frames < max_length:
        padding = np.zeros((mel_spec_db.shape[0], max_length - n_frames))
        mel_spec_db = np.hstack((mel_spec_db, padding))

    return mel_spec_db

# Extract features from all segmented files
def _natural_sort_key(name):
    """Sort key that orders embedded numbers numerically (letter2 < letter10)."""
    # re.split with a capturing group alternates text and digit runs, so
    # keys always compare str with str and int with int position by position.
    return [int(part) if part.isdigit() else part for part in re.split(r'(\d+)', name)]


def extract_features_from_segmented(output_folder):
    """Compute mel features for every ``.wav`` file in ``output_folder``.

    Files are processed in natural order of their names, so
    ``letter1.wav, letter2.wav, ..., letter10.wav`` come back in the order
    the segments were numbered. ``os.listdir`` alone gives no ordering
    guarantee, and a plain sort would put ``letter10`` before ``letter2``.

    Args:
        output_folder: Folder containing the segmented audio files.

    Returns:
        A tuple ``(features, file_names)``: a NumPy array with one
        spectrogram per file and the matching list of file names.
    """
    file_names = sorted(
        (entry for entry in os.listdir(output_folder) if entry.endswith('.wav')),
        key=_natural_sort_key,
    )
    features = [
        extract_mel_features(os.path.join(output_folder, file_name))
        for file_name in file_names
    ]

    return np.array(features), file_names

In [6]:
# Define paths for audio processing
# NOTE(review): INPUT_FILE_PATH is an absolute '/content/drive/...' path while
# OUTPUT_FOLDER is relative to the kernel's working directory; confirm both
# exist in the environment this notebook is run in.
INPUT_FILE_PATH = '/content/drive/MyDrive/Project I/testing_data/HELLO.wav'  # Path to the input audio file
OUTPUT_FOLDER = './letters/'  # Folder to store the individual letter files

# Run the process for the given audio file
# (loads the recording, detects sound regions and writes one file per region
# into OUTPUT_FOLDER).
process_word_audio(INPUT_FILE_PATH, OUTPUT_FOLDER)

NameError: name 'librosa' is not defined

In [7]:
# Turn every segmented letter file into a fixed-size mel-spectrogram
X_segmented, file_names_segmented = extract_features_from_segmented(OUTPUT_FOLDER)

# Add the trailing channel axis expected by the model input
X_segmented = X_segmented.reshape((len(X_segmented), 128, 128, 1))

# Class probabilities for each segment
predictions = cmkd_model.predict(X_segmented)

# Most probable class id per segment
predicted_classes = predictions.argmax(axis=1)

# Translate class ids back to the original label strings
decoded_labels = label_encoder.inverse_transform(predicted_classes)

# Report one line per segmented file
for file_name, predicted_label in zip(file_names_segmented, decoded_labels):
    print(f"{file_name} predicted class: {predicted_label}")

FileNotFoundError: [WinError 3] The system cannot find the path specified: './letters/'