In [17]:
!pip install pydub
!apt-get install ffmpeg


Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 19 not upgraded.


In [18]:
# Mount Google Drive at /content/drive; the paths used later in this notebook
# all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [20]:
# Path of the MP3 referenced again in the final cell. This cell only echoes the
# literal as its output; it does not bind it to a variable.
'/content/drive/MyDrive/Carl Sagan - Pale Blue Dot.mp3'

'/content/drive/MyDrive/Carl Sagan - Pale Blue Dot.mp3'

In [2]:
!pip install tensorflow



In [21]:
import os
import numpy as np
import librosa
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

In [22]:
# Step 1: Collect a dataset of audio files and their corresponding transcriptions
# For this example, we'll use the LibriSpeech dataset

# Step 2: Preprocess the audio files
def preprocess_audio(audio_path, n_mfcc=13, max_frames=100):
    """Convert one audio file into a fixed-size MFCC matrix.

    The file is decoded at 16 kHz, resampled to 8 kHz, run through a
    first-order pre-emphasis filter and converted to MFCCs. The frame axis is
    then zero-padded or truncated to exactly ``max_frames`` columns.

    Args:
        audio_path: Path of the audio file passed to ``librosa.load``.
        n_mfcc: Number of MFCC coefficients to extract (rows of the result).
        max_frames: Number of frames to keep (columns of the result).

    Returns:
        ``numpy.ndarray`` of shape ``(n_mfcc, max_frames)``.

    Raises:
        ValueError: If the file decodes to zero samples.
    """
    target_sr = 8000
    pre_emphasis = 0.97

    # Load audio file
    audio, sr = librosa.load(audio_path, sr=16000)
    if audio.size == 0:
        # Without this guard the pre-emphasis step below fails on audio[0]
        # with an IndexError that does not name the offending file.
        raise ValueError('Audio file contains no samples: {}'.format(audio_path))

    # Resample to 8kHz. The sample rates are passed by keyword because
    # librosa >= 0.10 no longer accepts them positionally.
    audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)

    # Apply pre-emphasis filter: y[0] = x[0], y[t] = x[t] - 0.97 * x[t-1]
    emphasized_audio = np.append(audio[0], audio[1:] - pre_emphasis * audio[:-1])

    # Extract MFCC features (the signal must be passed as ``y=`` on librosa >= 0.10)
    mfcc = librosa.feature.mfcc(y=emphasized_audio, sr=target_sr, n_mfcc=n_mfcc)

    # Pad with zeros or truncate so every clip has the same number of frames
    missing_frames = max_frames - mfcc.shape[1]
    if missing_frames > 0:
        mfcc = np.pad(mfcc, pad_width=((0, 0), (0, missing_frames)), mode='constant')
    else:
        mfcc = mfcc[:, :max_frames]

    return mfcc

In [23]:
# Step 3: Extract features from the audio files
def extract_features(audio_dir):
    """Collect MFCC features and transcripts for every ``.flac`` file under a directory.

    The tree rooted at ``audio_dir`` is walked recursively. For each file whose
    name ends in ``.flac`` the transcript is read from the file with the same
    name but a ``.txt`` extension in the same directory, and the audio is
    converted with ``preprocess_audio``.

    Args:
        audio_dir: Root directory of the dataset. It must be a directory:
            ``os.walk`` yields nothing for a plain file path, which produces
            an empty dataset.

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays with one entry per audio
        file; ``labels`` holds the stripped transcript strings.

    Raises:
        FileNotFoundError: If an audio file has no matching ``.txt`` transcript.
    """
    audio_ext = '.flac'
    features = []
    labels = []
    for root, dirs, files in os.walk(audio_dir):
        # Sorting in place makes os.walk descend in a stable order, so the
        # sample order does not depend on the filesystem.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(audio_ext):
                continue
            audio_path = os.path.join(root, file)
            # Swap only the trailing extension; str.replace would also rewrite
            # any '.flac' occurring earlier in the path (e.g. in a folder name).
            label_path = audio_path[:-len(audio_ext)] + '.txt'
            with open(label_path, 'r') as f:
                label = f.read().strip()
            features.append(preprocess_audio(audio_path))
            labels.append(label)
    return np.array(features), np.array(labels)


In [24]:
# Step 4: Split the dataset into training, validation, and testing sets
def split_dataset(features, labels, seed=None):
    """Shuffle the samples and split them 80/10/10 into train/validation/test.

    Args:
        features: Array-like with one entry per sample along the first axis.
        labels: Array-like with one label per sample, aligned with ``features``.
        seed: Optional integer seed for a reproducible shuffle. With the
            default ``None`` the global NumPy random state is used.

    Returns:
        Tuple ``(x_train, y_train, x_val, y_val, x_test, y_test)``.

    Raises:
        ValueError: If ``features`` and ``labels`` differ in length.
    """
    # Convert up front so plain lists support the index-array lookups below.
    features = np.asarray(features)
    labels = np.asarray(labels)

    num_samples = len(features)
    if len(labels) != num_samples:
        raise ValueError(
            'features and labels must have the same length, got {} and {}'.format(
                num_samples, len(labels)))

    indices = np.arange(num_samples)
    if seed is None:
        np.random.shuffle(indices)
    else:
        # Private generator: reproducible without disturbing the global state.
        np.random.RandomState(seed).shuffle(indices)

    train_end = int(num_samples * 0.8)
    val_end = int(num_samples * 0.9)
    train_indices = indices[:train_end]
    val_indices = indices[train_end:val_end]
    test_indices = indices[val_end:]

    x_train, y_train = features[train_indices], labels[train_indices]
    x_val, y_val = features[val_indices], labels[val_indices]
    x_test, y_test = features[test_indices], labels[test_indices]
    return x_train, y_train, x_val, y_val, x_test, y_test

In [25]:
# Step 5: Train a CNN model on the training set
def train_model(x_train, y_train, x_val, y_val, epochs=10, batch_size=32, num_classes=29):
    """Build and fit a small CNN classifier on MFCC matrices.

    The network stacks three Conv2D/MaxPooling2D stages (32, 64, 128 filters),
    a 256-unit dense layer with 50% dropout, and a ``num_classes``-way softmax.
    It is compiled with Adam and categorical cross-entropy.

    Args:
        x_train: Training features, ``(N, 13, 100)`` or ``(N, 13, 100, 1)``.
        y_train: Training targets. The softmax/categorical cross-entropy pair
            requires one-hot rows of width ``num_classes``.
        x_val: Validation features, same layout as ``x_train``.
        y_val: Validation targets, same layout as ``y_train``.
        epochs: Number of training epochs.
        batch_size: Mini-batch size.
        num_classes: Width of the output layer.

    Returns:
        The fitted Keras ``Sequential`` model.
    """
    def _with_channel_axis(x):
        # Conv2D consumes (rows, cols, channels); the MFCC matrices carry no
        # channel axis, so add one explicitly when it is missing.
        if getattr(x, 'ndim', None) == 3:
            return np.expand_dims(x, axis=-1)
        return x

    x_train = _with_channel_axis(x_train)
    x_val = _with_channel_axis(x_val)

    model = Sequential()
    # padding='same' keeps the convolutions from shrinking the feature map, so
    # only the three 2x2 poolings reduce the 13-row axis.
    model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(13, 100, 1), padding='same'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Flatten())
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size,
              validation_data=(x_val, y_val))
    return model

In [26]:
# Step 6: Evaluate the performance of the model on the testing set
def evaluate_model(model, x_test, y_test):
    """Score ``model`` on the held-out data and print its loss and accuracy.

    Args:
        model: Object whose ``evaluate(x, y)`` returns a ``(loss, accuracy)`` pair.
        x_test: Test features forwarded to ``model.evaluate``.
        y_test: Test targets forwarded to ``model.evaluate``.

    Returns:
        None; the two metrics are only written to standard output.
    """
    metrics = model.evaluate(x_test, y_test)
    # Exactly two values are expected: the loss, then the accuracy.
    test_loss, test_accuracy = metrics
    report = (('Test loss:', test_loss), ('Test accuracy:', test_accuracy))
    for caption, value in report:
        print(caption, value)


In [27]:
# Step 7: Use the trained model to transcribe new audio files into text files
def transcribe_audio(audio_path, model):
    """Run ``model`` on one audio file and decode its output into characters.

    The MFCC matrix from ``preprocess_audio`` is given batch and channel axes
    and passed to ``model.predict``. Decoding takes the arg-max over the class
    axis and maps class ``k`` to ``chr(k + 97)`` (``0 -> 'a'``):

    * output of shape ``(1, classes)`` yields a single character;
    * output of shape ``(1, steps, classes)`` yields one character per step.

    Args:
        audio_path: Path of the audio file to transcribe.
        model: Object exposing ``predict(batch)``.

    Returns:
        The decoded string.
    """
    mfcc = np.asarray(preprocess_audio(audio_path))
    batch = mfcc[np.newaxis, ..., np.newaxis]  # (1, rows, frames, 1)
    prediction = np.asarray(model.predict(batch))

    # Promote a single class-score vector to one "time step". Indexing it
    # element by element and calling argmax on each scalar always returns 0,
    # which decodes every position to 'a'.
    scores = np.atleast_2d(prediction[0])
    class_ids = scores.argmax(axis=-1)

    # NOTE(review): class ids above 25 map to '{', '|', '}' under this scheme;
    # confirm the intended alphabet for a 29-class output layer.
    return ''.join(chr(int(class_id) + 97) for class_id in class_ids)


In [None]:
# Run the above functions
# extract_features() walks a directory tree looking for .flac/.txt pairs, so it
# needs the dataset folder. Passing it the path of the single .mp3 makes
# os.walk yield nothing and leaves the dataset empty.
# TODO: point DATASET_DIR at the real location of the training data.
DATASET_DIR = '/content/drive/MyDrive/LibriSpeech'
AUDIO_PATH = '/content/drive/MyDrive/Carl Sagan - Pale Blue Dot.mp3'

if not os.path.isdir(DATASET_DIR):
    raise NotADirectoryError(f'Dataset directory not found: {DATASET_DIR}')

features, labels = extract_features(DATASET_DIR)
if len(features) == 0:
    raise ValueError(f'No .flac/.txt pairs found under {DATASET_DIR}')

# NOTE(review): `labels` holds raw transcript strings, while train_model()
# compiles the network with categorical cross-entropy over 29 classes. The
# labels still need to be encoded into that form before training can succeed.
x_train, y_train, x_val, y_val, x_test, y_test = split_dataset(features, labels)
model = train_model(x_train, y_train, x_val, y_val)
evaluate_model(model, x_test, y_test)
transcription = transcribe_audio(AUDIO_PATH, model)
print(transcription)