In [1]:
import pandas as pd
import urllib.request
import os
import sys
import librosa
import soundfile as sf
import time
from pydub import AudioSegment

In [2]:
# Relative path to the metadata CSV that is handed to GetAudio as csv_filepath.
# GetAudio reads the 'language_num' column of this file to decide what to download.
CSV_FILE_PATH = "../../bio_metadata.csv"

In [3]:
class GetAudio:
    """
        Downloads the recordings listed in a metadata CSV and stores them as WAV files.

        Every value of the CSV's 'language_num' column is substituted into self.url,
        the resulting MP3 is downloaded and converted to WAV inside self.destination_folder.
    """

    def __init__(
        self,
        csv_filepath,
        destination_folder='audio/',
        wait=1.5,
        debug=True
    ):
        '''
            Initializes GetAudio class object
            :param csv_filepath (str): Path of the CSV file to read; it must contain a 'language_num' column
            :param destination_folder (str): Folder (created under "../../data") where audio files will be saved
            :param wait (float): Length (in seconds) between web requests
            :param debug (bool): Outputs status indicators to console when True
        '''
        self.csv_filepath = csv_filepath
        self.audio_df = pd.read_csv(csv_filepath)
        self.url = 'http://accent.gmu.edu/soundtracks/{}.mp3'
        # Already the full output directory; it must not be prefixed with "../../data" again.
        self.destination_folder = os.path.join("../../data", destination_folder)
        self.wait = wait
        self.debug = debug

    def _wav_path(self, lang_num):
        """
            Builds the path of the WAV file for one recording.
            :param lang_num: Value taken from the 'language_num' column
            :return (str): Path of '<lang_num>.wav' inside self.destination_folder
        """
        # os.path.join works whether or not destination_folder ends with a separator.
        return os.path.join(self.destination_folder, '{}.wav'.format(lang_num))

    def check_path(self):
        """
            Checks if self.destination_folder exists. If not, a folder called self.destination_folder is created
        """
        folder_path = self.destination_folder
        if not os.path.isdir(folder_path):
            if self.debug:
                print(f"{folder_path} does not exist, creating")
            # exist_ok avoids a failure if the folder appears between the check and the call.
            os.makedirs(folder_path, exist_ok=True)

    def get_audio(self):
        '''
            Retrieves all audio files from 'language_num' column of self.audio_df
            If audio file already exists, move on to the next
            Download or conversion errors are not caught and abort the run.
            :return (int): Number of audio files downloaded
        '''
        self.check_path()
        counter = 0
        for lang_num in self.audio_df['language_num']:
            wav_path = self._wav_path(lang_num)

            # if file already exists, move on to the next
            if os.path.exists(wav_path):
                if self.debug:
                    print('File Already here {}'.format(lang_num))
                continue

            if self.debug:
                print('downloading {}'.format(lang_num))

            # Throttle: pause before every request except the first one of this run.
            if counter > 0 and self.wait and self.wait > 0:
                time.sleep(self.wait)

            filename = None
            try:
                # Without a target path urlretrieve stores the download in a temporary file.
                filename, _headers = urllib.request.urlretrieve(self.url.format(lang_num))
                sound = AudioSegment.from_mp3(filename)
                sound.export(wav_path, format="wav")
            finally:
                # Remove the temporary MP3 so repeated runs do not fill the temp directory.
                if filename is not None:
                    try:
                        os.remove(filename)
                    except OSError:
                        # Best-effort cleanup; never mask an error raised by the download itself.
                        pass
            counter += 1

        return counter


In [4]:
# Build the downloader from the metadata CSV; the constructor reads the CSV immediately.
csv_file = CSV_FILE_PATH
ga = GetAudio(csv_filepath=csv_file)
# Last expression of the cell: get_audio() returns the number of files it downloaded,
# so that count is shown as the cell's output.
ga.get_audio()

File Already here mandarin1
File Already here mandarin2
File Already here mandarin3
File Already here mandarin4
File Already here mandarin5
File Already here mandarin6
File Already here mandarin7
File Already here mandarin8
File Already here mandarin9
File Already here mandarin10
File Already here mandarin11
File Already here mandarin12
File Already here mandarin13
File Already here mandarin14
File Already here mandarin15
File Already here mandarin16
File Already here mandarin17
File Already here mandarin18
File Already here mandarin19
File Already here mandarin20
File Already here mandarin21
File Already here mandarin22
File Already here mandarin23
File Already here mandarin24
File Already here mandarin25
File Already here mandarin26
File Already here mandarin27
File Already here mandarin28
File Already here mandarin29
File Already here mandarin30
File Already here mandarin31
File Already here mandarin32
File Already here mandarin33
File Already here mandarin34
File Already here manda

393