In [None]:
# Install necessary libraries.
# %pip (rather than !pip) installs into the environment of the running kernel;
# -q keeps the installer log out of the notebook output.
%pip install -q librosa pydub tensorflow



In [None]:
# Importing necessary libraries
import librosa
import os
import numpy as np

# Example of loading an audio file (MP3, WAV, etc.)
def load_audio(file_path, sr=16000):
    """Load an audio file as a waveform resampled to ``sr`` Hz.

    Decoding and resampling are delegated to ``librosa.load``.

    Args:
        file_path: Path to the audio file (MP3, WAV, etc.).
        sr: Target sample rate in Hz, forwarded to ``librosa.load``.
            Defaults to 16000 (16 kHz is typical for speech).

    Returns:
        A ``(audio, sample_rate)`` tuple exactly as returned by ``librosa.load``.
    """
    audio, sample_rate = librosa.load(file_path, sr=sr)
    return audio, sample_rate

# Example input: a hard-coded absolute path to an MP3 file.
# NOTE(review): the file must already exist at this path before the cell runs;
# nothing visible in this notebook creates it — confirm how it gets there.
audio_file = '/content/audio.mp3'
# Decode the file; load_audio returns the waveform and its sample rate.
audio_data, sample_rate = load_audio(audio_file)


In [None]:
# Preprocess the audio into MFCCs
def extract_mfcc(audio_data, sample_rate, n_mfcc=13):
    """Compute MFCC features for a waveform.

    Args:
        audio_data: Waveform samples, passed to ``librosa.feature.mfcc`` as ``y``.
        sample_rate: Sample rate of ``audio_data`` in Hz.
        n_mfcc: Number of coefficients to request. Defaults to 13.

    Returns:
        Whatever ``librosa.feature.mfcc`` returns; for the example file in this
        notebook that is an array of shape ``(13, 95)``.
    """
    return librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=n_mfcc)

# Extract MFCCs from the waveform loaded in the previous cell
mfccs = extract_mfcc(audio_data, sample_rate)

# Display the MFCC shape. The first axis is the number of coefficients
# (13, as requested inside extract_mfcc); the second axis grows with the
# length of the audio.
print(mfccs.shape)


(13, 95)


In [None]:
from pydub import AudioSegment

# Convert MP3/OPUS to WAV format
def convert_to_wav(input_file, output_file):
    """Re-encode ``input_file`` as WAV and write it to ``output_file``.

    The input is opened with ``AudioSegment.from_file`` and exported with
    ``format="wav"``. Nothing is returned.

    Args:
        input_file: Path of the source audio file.
        output_file: Path the WAV file is written to.
    """
    AudioSegment.from_file(input_file).export(output_file, format="wav")

# Example: convert the MP3 used earlier into a WAV file next to it.
# Source file (same path as the earlier loading example):
mp3_file = '/content/audio.mp3'
# Destination for the converted audio:
wav_file = '/content/audio.wav'
convert_to_wav(mp3_file, wav_file)

# Load the WAV file via load_audio. This rebinds audio_data and sample_rate,
# replacing the values obtained from the MP3 in the earlier cell.
audio_data, sample_rate = load_audio(wav_file)

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Define the model
def build_model(input_shape, num_classes=1):
    """Build and compile a two-layer LSTM classifier.

    Architecture: Input -> LSTM(128, return_sequences=True) -> LSTM(128)
    -> Dense(64, relu) -> Dense(num_classes, <output activation>).

    Args:
        input_shape: Per-sample input shape handed to ``layers.Input``,
            e.g. ``(None, 13)`` for variable-length sequences of 13 MFCCs.
        num_classes: Number of output units. With the default of 1 the
            output layer keeps its original width but uses a sigmoid with
            binary cross-entropy. With more than one unit it uses softmax
            with categorical cross-entropy.

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, accuracy metric).

    Raises:
        ValueError: If ``num_classes`` is smaller than 1.
    """
    if num_classes < 1:
        raise ValueError(f"num_classes must be >= 1, got {num_classes}")

    # A softmax over a single unit always outputs 1.0, so the previous
    # Dense(1, softmax) + categorical_crossentropy pairing could never learn.
    if num_classes == 1:
        output_activation, loss = 'sigmoid', 'binary_crossentropy'
    else:
        output_activation, loss = 'softmax', 'categorical_crossentropy'

    model = models.Sequential()
    model.add(layers.Input(shape=input_shape))
    # First LSTM emits the full sequence so the second LSTM can consume it.
    model.add(layers.LSTM(128, return_sequences=True))
    model.add(layers.LSTM(128))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(num_classes, activation=output_activation))

    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])
    return model

# Build the model (left commented out; uncomment the two lines below so that `model` exists before the save cell runs)
# model = build_model(input_shape=(None, 13))  # 13 MFCCs
# model.summary()

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q jiwer




In [None]:
import jiwer

# Sample predicted (hypothesis) and true (reference) text
predicted_text = "hello world"
true_text = "hello"

# jiwer.wer takes the reference first and the hypothesis second.
# Here the hypothesis has one extra word against a one-word reference,
# so the printed rate is 1.0.
wer = jiwer.wer(true_text, predicted_text)
print(f"Word Error Rate (WER): {wer}")

Word Error Rate (WER): 1.0


In [None]:
# def predict_speech(model, audio_file):
#     audio_data, sample_rate = load_audio(audio_file)
#     mfccs = extract_mfcc(audio_data, sample_rate)
#     # Reshape MFCCs to (number of frames, number of MFCC coefficients)
#     mfccs = mfccs.T  # Transpose if necessary to get (num_frames, 13) shape

#     # Assuming your model expects a 3D input (batch_size, timesteps, features)
#     mfccs = mfccs[np.newaxis, ...]  # Add batch dimension
#     predicted_text = model.predict(mfccs)
#     return predicted_text


In [None]:
# Save the model to '/content/model.h5'.
# NOTE(review): no visible earlier cell assigns `model` (the build_model call
# is commented out), so on a fresh kernel this line raises NameError —
# confirm where `model` is meant to come from.
model.save('/content/model.h5')



In [None]:
import librosa
import numpy as np
from tensorflow.keras.models import load_model

In [None]:
# Reload the model from disk; this rebinds the global `model` used by the
# prediction cell further down.
model_path = '/content/model.h5'  # Path to your saved .h5 model
model = load_model(model_path)
print("Model loaded successfully.")



Model loaded successfully.


In [None]:
def preprocess_audio(audio_file, sr=16000, n_mfcc=13, max_len=125):
    """Turn an audio file into a fixed-length MFCC batch of size one.

    Args:
        audio_file: Path to the audio file to load.
        sr: Sample rate in Hz used for both loading and MFCC extraction.
        n_mfcc: Number of MFCC coefficients per frame.
        max_len: Number of frames in the result; longer inputs are
            truncated and shorter ones are zero-padded at the end.

    Returns:
        Array of shape ``(1, max_len, n_mfcc)``: batch, timesteps, features.
    """
    # Load the audio file at the requested sample rate
    audio, _ = librosa.load(audio_file, sr=sr)

    # librosa.feature.mfcc takes keyword arguments only; passing the waveform
    # positionally raises "mfcc() takes 0 positional arguments ...".
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)

    # Pad or truncate along the frame axis (axis 1) to exactly max_len frames
    n_frames = mfccs.shape[1]
    if n_frames > max_len:
        mfccs = mfccs[:, :max_len]
    else:
        mfccs = np.pad(
            mfccs,
            pad_width=((0, 0), (0, max_len - n_frames)),
            mode='constant',
        )

    # (n_mfcc, max_len) -> (max_len, n_mfcc), then add the batch dimension
    return np.expand_dims(mfccs.T, axis=0)

In [None]:
def predict_speech(model, audio_file):
    """Run ``model`` on one audio file and return the top-scoring class index.

    Despite the name, the result is not text: it is the argmax (along axis 1)
    of the model output for the first item in the batch.

    Args:
        model: Object exposing ``predict(batch)``.
        audio_file: Path handed to ``preprocess_audio``.

    Returns:
        Index of the largest score for the first (only) sample.
    """
    features = preprocess_audio(audio_file)
    scores = model.predict(features)

    # Example decoding for classification; adapt to the model's output format.
    best_per_sample = np.argmax(scores, axis=1)
    return best_per_sample[0]

In [None]:
# Run the loaded model on the converted WAV file and report the result.
audio_file = '/content/audio.wav'  # Path to your audio file
try:
    # predict_speech returns a class index, not decoded text.
    predicted_text = predict_speech(model, audio_file)
    print("Predicted text:", predicted_text)
except FileNotFoundError as e:
    print(f"Audio file not found: {e}")
except Exception as e:
    # Catch-all: any other failure is printed instead of raised, so only the
    # message is shown and no traceback.
    print(f"An error occurred during prediction: {e}")

An error occurred during prediction: mfcc() takes 0 positional arguments but 1 positional argument (and 2 keyword-only arguments) were given


In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel;
# -q keeps the long installer log out of the notebook output.
%pip install -q openai-whisper
%pip install -q torch



Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/800.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/800.5 kB[0m [31m6.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting triton>=2.0.0 (from openai-whisper)
  Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylin

In [None]:
import whisper

# Load the pretrained Whisper model ("base" checkpoint).
# Note: this rebinds the global name `model`, which earlier cells used for the
# Keras model loaded from '/content/model.h5'; transcribe_audio below reads
# this global.
model = whisper.load_model("base")
print("Whisper model loaded successfully.")

# Function to transcribe audio
def transcribe_audio(audio_file):
    """Transcribe ``audio_file`` as English using the global ``model``.

    Args:
        audio_file: Path passed to ``model.transcribe``.

    Returns:
        The ``"text"`` entry of the transcription result, or ``None`` if any
        exception occurred (the error is printed, not raised).
    """
    try:
        transcription = model.transcribe(audio_file, language="en")
        text = transcription["text"]
    except Exception as err:
        print(f"Error during transcription: {err}")
        text = None
    return text

# Path to your audio file
audio_file = '/content/hello.waptt'

# Transcribe audio
transcribed_text = transcribe_audio(audio_file)
# transcribe_audio returns None only on failure. An empty string is a valid
# transcription, so compare against None rather than relying on truthiness.
if transcribed_text is not None:
    print("Transcribed text:", transcribed_text)
else:
    print("Failed to transcribe audio.")


  checkpoint = torch.load(fp, map_location=device)


Whisper model loaded successfully.




Transcribed text:  Hello, how are you?
