In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file in it.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%pip install librosa pydub speechrecognition --quiet

In [None]:
import os
import librosa
import speech_recognition as sr
from pydub import AudioSegment
import logging
import shutil
import IPython
from IPython import display as ipd

In [None]:
"""Creating the audio files from the dataset: DAIC-WOZ"""

# Source dataset (read-only) and the writable directory the recordings are copied to.
data_dir = '/kaggle/input/daic-woz'

audio_dir = '/kaggle/working/audio_files'
os.makedirs(audio_dir, exist_ok=True)

# One sub-folder per dataset entry; sorted so the processing order is deterministic.
folders = sorted(entry.name for entry in os.scandir(data_dir) if entry.is_dir())
# Source paths of the recordings that were copied successfully.
audio_files = []

for folder in folders:
    folder = os.path.join(data_dir, folder)

    # Match the extension case-insensitively so '.WAV' files are found as well.
    wav_names = sorted(
        entry.name
        for entry in os.scandir(folder)
        if entry.is_file() and os.path.splitext(entry.name)[1].lower() == '.wav'
    )
    if not wav_names:
        # A folder without a recording must not abort the whole run.
        print(f"No .wav file found in {folder}; skipping")
        continue

    # Only the first recording (alphabetically) of each folder is used.
    audio_file = os.path.join(folder, wav_names[0])
    destination = os.path.join(audio_dir, os.path.basename(audio_file))
    try:
        shutil.copy(audio_file, destination)
    except OSError as e:
        print(f"Failed to copy {audio_file}: {e}")
        continue

    audio_files.append(audio_file)
    print(f"Copied {audio_file} -> {destination}")

In [None]:
# Directory that receives the transcript text files; created up front
# (exist_ok=True makes re-running this cell harmless).
TRANSCRIPT_DIR = "/kaggle/working/transcripts"
os.makedirs(TRANSCRIPT_DIR, exist_ok=True)

In [None]:
# Speech-to-text: transcribe audio files with the SpeechRecognition library.
audio_dir = '/kaggle/working/audio_files'
TRANSCRIPT_DIR = "/kaggle/working/transcripts"


class SpeechToText:
    """Transcribe audio files and save one ``.txt`` transcript per file.

    Attributes:
        recognizer: ``speech_recognition.Recognizer`` used for every file.
        audio_dir: Directory scanned by ``process_directory`` and used as the
            output location for files converted to WAV.
        transcript_dir: Directory the transcript text files are written to.
    """

    # Recognition back-ends that convert_file knows how to call.
    SUPPORTED_APIS = ("google", "sphinx")
    # Lower-case file extensions that process_directory picks up.
    AUDIO_EXTENSIONS = ('.wav', '.mp3', '.flac', '.ogg')

    def __init__(self, audio_dir=audio_dir, transcript_dir=TRANSCRIPT_DIR):
        """Initialize the SpeechToText converter.

        Args:
            audio_dir: Directory containing the audio files to transcribe.
            transcript_dir: Directory where transcripts are saved.
        """
        self.recognizer = sr.Recognizer()
        self.audio_dir = audio_dir
        self.transcript_dir = transcript_dir

    def convert_file(self, audio_file_path, api="google"):
        """Convert a single audio file to text.

        Args:
            audio_file_path: Path of the audio file. Anything that does not
                end in ``.wav`` is first converted with ``_convert_to_wav``.
            api: ``"google"`` (``Recognizer.recognize_google``) or
                ``"sphinx"`` (``Recognizer.recognize_sphinx``).

        Returns:
            The recognized text, or ``None`` when ``api`` is not supported or
            when conversion/recognition fails. Failures are printed rather
            than raised so that one bad file does not stop a batch run.
        """
        # Reject unknown back-ends before doing any expensive file work.
        if api not in self.SUPPORTED_APIS:
            print("API Not Supported")
            return None

        try:
            # Check if file is WAV format, convert if not
            if not audio_file_path.lower().endswith('.wav'):
                audio_file_path = self._convert_to_wav(audio_file_path)

            with sr.AudioFile(audio_file_path) as source:
                audio_data = self.recognizer.record(source)

            # NOTE(review): the whole recording is recognized in one call; long
            # recordings may be rejected by an online service - confirm, and
            # split into chunks if needed.
            if api == "google":
                text = self.recognizer.recognize_google(audio_data)
            else:
                text = self.recognizer.recognize_sphinx(audio_data)
        except Exception as e:
            # Best effort: report why the file was skipped instead of hiding it.
            # The exception type is included because some errors carry no message.
            print(f"Transcription failed for {audio_file_path}: {type(e).__name__}: {e}")
            return None

        print(f"Transcription successful: {text[:50]}...")
        return text

    def _convert_to_wav(self, audio_file_path):
        """Convert an audio file to a mono, 16 kHz WAV file.

        The result is written into ``self.audio_dir`` under the source file's
        base name with a ``.wav`` extension.

        Args:
            audio_file_path: Path of the audio file to convert.

        Returns:
            Path of the WAV file that was written.

        Raises:
            Exception: Whatever pydub raises while decoding or exporting.
        """
        name, _ = os.path.splitext(os.path.basename(audio_file_path))
        wav_path = os.path.join(self.audio_dir, f"{name}.wav")

        audio = AudioSegment.from_file(audio_file_path)
        audio = audio.set_channels(1)  # Convert to mono
        audio = audio.set_frame_rate(16000)  # Set sample rate to 16kHz
        audio.export(wav_path, format="wav")

        return wav_path

    def process_directory(self, api="google"):
        """Process all audio files in the audio directory.

        Every file in ``self.audio_dir`` whose extension (case-insensitive)
        is in ``AUDIO_EXTENSIONS`` is transcribed with ``convert_file``; each
        non-empty transcript is written to ``<transcript_dir>/<name>.txt``.

        Args:
            api: Recognition back-end forwarded to ``convert_file``. A value
                outside ``SUPPORTED_APIS`` falls back to ``"google"`` with a
                warning, so callers that passed something else here keep
                getting transcripts.

        Returns:
            dict mapping the file name without extension to its transcript,
            containing only the files that were transcribed successfully.
        """
        if api not in self.SUPPORTED_APIS:
            print(f"Unsupported api {api!r}; falling back to 'google'")
            api = "google"

        # Make sure the output directory exists even if it was not created earlier.
        os.makedirs(self.transcript_dir, exist_ok=True)

        transcripts = {}

        # Sorted so files are processed in a reproducible order.
        for filename in sorted(os.listdir(self.audio_dir)):
            if not filename.lower().endswith(self.AUDIO_EXTENSIONS):
                continue

            filepath = os.path.join(self.audio_dir, filename)
            print("Processing", filepath)

            # Get transcript
            transcript = self.convert_file(filepath, api=api)
            if not transcript:
                continue

            # Save transcript to file
            name, _ = os.path.splitext(filename)
            transcript_path = os.path.join(self.transcript_dir, f"{name}.txt")
            with open(transcript_path, 'w', encoding='utf-8') as f:
                f.write(transcript)

            transcripts[name] = transcript

        return transcripts

In [None]:
# Transcribe every audio file in the default audio directory.
# process_directory() only accepts the recognition back-end (`api`); the
# directory to scan comes from the SpeechToText instance, so no path is passed.
stt = SpeechToText()
transcripts = stt.process_directory()
# Print a short summary rather than dumping every full transcript into the output.
print(f"Transcribed {len(transcripts)} file(s): {sorted(transcripts)}")