# Transciribing Audio with OpenAI's Whisper

In [25]:
from jiwer import wer, cer
import pandas as pd
import whisper
import os
import csv
from gtts import gTTS
from IPython.display import Audio

In [2]:
file_path = r'D:\AI-ML\Speech Recognition\Files\speech_01.wav'

In [3]:
model = whisper.load_model("base", device="cpu")

In [11]:
result = model.transcribe(file_path)

transcribed_text_whisper = result["text"]

print(transcribed_text_whisper)

 My name is Yvonne and I am excited to have you as part of our learning community. Before we get started, I'd like to tell you a little bit about myself. I'm a sound engineer turned a scientist, curious about machine learning and artificial intelligence. My professional background is primarily in media production, with a focus on audio, IT and communications.


In [5]:
result["language"]

'en'

In [6]:
ground_truth = """My name is Ivan and I am excited to have you as part of our learning community! 
Before we get started, I’d like to tell you a little bit about myself. I’m a sound engineer turned data scientist,
curious about machine learning and Artificial Intelligence. My professional background is primarily in media production,
with a focus on audio, IT, and communications"""

In [10]:
calculated_wer = wer(ground_truth, transcribed_text_whisper)
calculated_cer = cer(ground_truth, transcribed_text_whisper)
print(f"Word Error Rate (WER): {calculated_wer:.4f}")
print(f"Character Error Rate (CER): {calculated_cer:.4f}")

Word Error Rate (WER): 0.2203
Character Error Rate (CER): 0.0470


# Transcribing Multiple Audio Files from a Directory

In [12]:
directory_path = r"D:\AI-ML\Speech Recognition\Files\WAV files"

In [16]:
def transcribe_directory_whisper(directory_path):
    transcriptions = []
    for file_name in os.listdir(directory_path):
        if file_name.endswith(".wav"):
            files_path = os.path.join(directory_path, file_name)
            # Transcribe the audio file
            result = model.transcribe(files_path)
            transcription = result["text"]
            transcriptions.append({"file_name": file_name, "transcription": transcription})
    return transcriptions

In [17]:
transcriptions = transcribe_directory_whisper(directory_path)

In [18]:
transcriptions

[{'file_name': 'Track1.wav',
  'transcription': " I'm a sound engineer turned data scientist. Curious about machine learning and artificial intelligence."},
 {'file_name': 'Track2.wav',
  'transcription': ' My professional background is primarily in media production with a focus on audio, IT, and communications.'},
 {'file_name': 'Track3.wav',
  'transcription': " Over the years, I've developed a strong interest in digital signal processing sound and music computing."},
 {'file_name': 'Track4.wav',
  'transcription': ' As a graduate of Sound Engineering, I make it a priority to strike a balance between art and technology and my work.'},
 {'file_name': 'Track5.wav',
  'transcription': ' I believe that nowadays, data is the key to everything.'},
 {'file_name': 'Track6.wav',
  'transcription': ' Not only can it provide a rational explanation for complicated scientific puzzles.'},
 {'file_name': 'Track7.wav',
  'transcription': ' But it can also give you efficient methodologies for solving

# Saving Audio Transcriptions to CSV for Easy Analysis

In [20]:
output_file = "transcriptions.csv"

with open(output_file, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Track Number", "File Name", "Transcription"])  # Write the header
    for number, transcription in enumerate(transcriptions, start=1):
        writer.writerow([number, transcription['file_name'], transcription['transcription']])

In [22]:
data = pd.read_csv("transcriptions.csv")

data.head()

Unnamed: 0,Track Number,File Name,Transcription
0,1,Track1.wav,I'm a sound engineer turned data scientist. C...
1,2,Track2.wav,My professional background is primarily in me...
2,3,Track3.wav,"Over the years, I've developed a strong inter..."
3,4,Track4.wav,"As a graduate of Sound Engineering, I make it..."
4,5,Track5.wav,"I believe that nowadays, data is the key to e..."


# Text-to-Speech

In [30]:
text = """My name is Suraj, I am studying in Sri Jayachamarajendra College of Engineering"""

tts = gTTS(text=text, lang='en')
tts.save("output.mp3")

Audio('output.mp3')