In [3]:
from pydub import AudioSegment
import os

def extract_first_30_seconds(input_file: str, output_file: str, duration_seconds: float = 30) -> bool:
    """
    Extracts the leading portion of a .wav file and saves it as a new .wav file.

    The clip length defaults to 30 seconds but can be overridden. Failures while
    loading or exporting are printed (not raised) and reported through the
    return value, so callers can tell whether the output file was written.

    Args:
        input_file (str): Path to the input .wav file.
        output_file (str): Path to save the output .wav file.
        duration_seconds (float): Length of the clip to keep, in seconds.
            Must be positive. Defaults to 30.

    Returns:
        bool: True if the clip was exported, False if an error occurred.

    Raises:
        ValueError: If duration_seconds is not positive.
    """
    if duration_seconds <= 0:
        raise ValueError(f"duration_seconds must be positive, got {duration_seconds}")

    # pydub slices are expressed in milliseconds
    duration_ms = int(duration_seconds * 1000)

    try:
        # Load the audio file
        audio = AudioSegment.from_wav(input_file)

        # Keep only the leading part of the recording
        clip = audio[:duration_ms]

        # export() hands back the open output file handle; close it explicitly
        # so that processing many files does not pile up open descriptors.
        exported = clip.export(output_file, format="wav")
        exported.close()
    except Exception as e:
        # Best effort: report the problem and let the caller decide what to do.
        print(f"An error occurred: {e}")
        return False

    print(f"Saved the first {duration_seconds:g} seconds to {output_file}")
    return True

# Example usage: clip one Common Yellowthroat recording and write the result
# next to the original inside the same species folder.
example_species_dir = os.path.join(os.getcwd(), 'data', 'bird_dataset', 'Geothlypis trichas_Common Yellowthroat')
input_wav_file = os.path.join(example_species_dir, 'XC105134.wav')
output_wav_file = os.path.join(example_species_dir, 'short_XC105134.wav')
extract_first_30_seconds(input_wav_file, output_wav_file)


Saved the first 30 seconds to /Users/solomongreene/DIS/DIS Core Course/data/bird_dataset/Geothlypis trichas_Common Yellowthroat/short_XC105134.wav


In [5]:
import pandas as pd
from pathlib import Path
import os

# Location of the bird dataset, relative to the current working directory
XC_ROOTDIR = './data/'
XC_DIR = 'bird_dataset'

# One record (dict) per .wav file found
filelist = []

# Every sub-folder of the dataset directory is treated as one species
species_dir = os.path.join(os.getcwd(), XC_ROOTDIR, XC_DIR)
for species_folder in os.listdir(species_dir):
    species_path = os.path.join(species_dir, species_folder)
    if not os.path.isdir(species_path):
        continue  # ignore plain files sitting next to the species folders

    for file_name in os.listdir(species_path):
        if not file_name.endswith('.wav'):
            continue  # only .wav recordings are catalogued
        filelist.append(dict(
            fullfilename=os.path.join(species_path, file_name),
            filename=Path(file_name).stem,
            species=species_folder,  # the folder name doubles as the species label
        ))

# Turn the collected records into a DataFrame
df_files_wav = pd.DataFrame(filelist)

# Summary of what was found
summary_rule = '====================================================='
print(summary_rule)
print(f'Number of files: {len(df_files_wav)}')
print(f'Number of species: {df_files_wav["species"].nunique()}')
print(summary_rule)

# Peek at the first rows to verify the result
print(df_files_wav.head())

Number of files: 1000
Number of species: 20
                                        fullfilename  filename  \
0  /Users/solomongreene/DIS/DIS Core Course/./dat...  XC721665   
1  /Users/solomongreene/DIS/DIS Core Course/./dat...  XC691616   
2  /Users/solomongreene/DIS/DIS Core Course/./dat...  XC140016   
3  /Users/solomongreene/DIS/DIS Core Course/./dat...  XC133365   
4  /Users/solomongreene/DIS/DIS Core Course/./dat...  XC624508   

                               species  
0  Setophaga magnolia_Magnolia Warbler  
1  Setophaga magnolia_Magnolia Warbler  
2  Setophaga magnolia_Magnolia Warbler  
3  Setophaga magnolia_Magnolia Warbler  
4  Setophaga magnolia_Magnolia Warbler  


In [7]:
# Species folders are named "<scientific name>_<common name>". Split once on the
# first underscore so a common name that itself contains "_" is kept whole; the
# vectorised .str accessor yields NaN (rather than raising IndexError) for a
# folder name that has no underscore.
df_files_wav['commonname'] = df_files_wav['species'].str.split('_', n=1).str[1]

In [8]:
# Display the frame (pandas elides the middle rows) to check the 'commonname' column.
df_files_wav

Unnamed: 0,fullfilename,filename,species,commonname
0,/Users/solomongreene/DIS/DIS Core Course/./dat...,XC721665,Setophaga magnolia_Magnolia Warbler,Magnolia Warbler
1,/Users/solomongreene/DIS/DIS Core Course/./dat...,XC691616,Setophaga magnolia_Magnolia Warbler,Magnolia Warbler
2,/Users/solomongreene/DIS/DIS Core Course/./dat...,XC140016,Setophaga magnolia_Magnolia Warbler,Magnolia Warbler
3,/Users/solomongreene/DIS/DIS Core Course/./dat...,XC133365,Setophaga magnolia_Magnolia Warbler,Magnolia Warbler
4,/Users/solomongreene/DIS/DIS Core Course/./dat...,XC624508,Setophaga magnolia_Magnolia Warbler,Magnolia Warbler
...,...,...,...,...
995,/Users/solomongreene/DIS/DIS Core Course/./dat...,XC447423,Geothlypis trichas_Common Yellowthroat,Common Yellowthroat
996,/Users/solomongreene/DIS/DIS Core Course/./dat...,XC556704,Geothlypis trichas_Common Yellowthroat,Common Yellowthroat
997,/Users/solomongreene/DIS/DIS Core Course/./dat...,XC909299,Geothlypis trichas_Common Yellowthroat,Common Yellowthroat
998,/Users/solomongreene/DIS/DIS Core Course/./dat...,XC828478,Geothlypis trichas_Common Yellowthroat,Common Yellowthroat


In [None]:
# Scratch cell: shows the absolute path of the shortened example clip built from the working directory.
os.path.join(os.getcwd(), 'data', 'bird_dataset', 'Geothlypis trichas_Common Yellowthroat', 'short_XC105134.wav')

In [None]:
# Shorten every recording listed in df_files_wav, mirroring the species folder
# layout under data/bird_dataset_shortened, and catalogue the clips produced.
file_list = []
for row in df_files_wav.itertuples(index=False):  # Avoid including the index
    file_path = row.fullfilename

    # Derive the destination from the row itself, so each recording gets its
    # own output file inside its own species folder.
    species_folder = row.species
    output_dir = os.path.join(os.getcwd(), 'data', 'bird_dataset_shortened', species_folder)
    output_path = os.path.join(output_dir, Path(file_path).name)

    # The export fails if the target folder is missing
    os.makedirs(output_dir, exist_ok=True)

    extract_first_30_seconds(file_path, output_path)

    # The helper prints errors instead of raising them, so only record clips
    # that actually exist on disk. (A file left over from an earlier run also
    # passes this check.)
    if not os.path.isfile(output_path):
        print(f"Skipping {file_path}: no shortened file was produced")
        continue

    file_list.append({
        'fullfilename': output_path,
        'filename': Path(output_path).stem,
        'species': species_folder  # The folder name is the species
    })

df_files_wav_shortened = pd.DataFrame(file_list)

In [None]:
import os
from pathlib import Path

# Shorten every recording listed in df_files_wav, mirroring the species folder
# layout under data/bird_dataset_shortened, and catalogue the clips produced.
file_list = []
for row in df_files_wav.itertuples(index=False):  # Avoid including the index
    file_path = row.fullfilename

    # Construct the output path: same species folder and file name, new root
    species_folder = row.species
    output_dir = os.path.join(os.getcwd(), 'data', 'bird_dataset_shortened', species_folder)
    output_path = os.path.join(output_dir, Path(file_path).name)  # Use original file name

    # Create the directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    try:
        # Extract the first 30 seconds of the audio
        extract_first_30_seconds(file_path, output_path)
    except Exception as e:
        print(f"An error occurred while processing {file_path}: {e}")
        continue

    # The helper reports its own errors by printing rather than raising, so the
    # except branch above does not see them. Only record clips that actually
    # exist on disk. (A file left over from an earlier run also passes this check.)
    if not os.path.isfile(output_path):
        print(f"Skipping {file_path}: no shortened file was produced")
        continue

    # Append to the list with updated file information
    file_list.append({
        'fullfilename': output_path,
        'filename': Path(output_path).stem,
        'species': species_folder  # The folder name is the species
    })

# Create a DataFrame from the processed files
df_files_wav_shortened = pd.DataFrame(file_list)

In [12]:
import pandas as pd
from pathlib import Path
import os

# Location of the shortened bird dataset, relative to the current working directory
XC_ROOTDIR = './data/'
XC_DIR = 'bird_dataset_shortened'

# One record (dict) per .wav file found
filelist = []

# Every sub-folder of the dataset directory is treated as one species
species_dir = os.path.join(os.getcwd(), XC_ROOTDIR, XC_DIR)
for species_folder in os.listdir(species_dir):
    species_path = os.path.join(species_dir, species_folder)
    if not os.path.isdir(species_path):
        continue  # ignore plain files sitting next to the species folders

    for file_name in os.listdir(species_path):
        if not file_name.endswith('.wav'):
            continue  # only .wav recordings are catalogued
        filelist.append(dict(
            fullfilename=os.path.join(species_path, file_name),
            filename=Path(file_name).stem,
            species=species_folder,  # the folder name doubles as the species label
        ))

# NOTE: this rebinds df_files_wav, which from here on describes the shortened clips
df_files_wav = pd.DataFrame(filelist)

# Summary of what was found
summary_rule = '====================================================='
print(summary_rule)
print(f'Number of files: {len(df_files_wav)}')
print(f'Number of species: {df_files_wav["species"].nunique()}')
print(summary_rule)

# Peek at the first rows to verify the result
print(df_files_wav.head())

Number of files: 1000
Number of species: 20
                                        fullfilename  filename  \
0  /Users/solomongreene/DIS/DIS Core Course/./dat...  XC721665   
1  /Users/solomongreene/DIS/DIS Core Course/./dat...  XC691616   
2  /Users/solomongreene/DIS/DIS Core Course/./dat...  XC140016   
3  /Users/solomongreene/DIS/DIS Core Course/./dat...  XC133365   
4  /Users/solomongreene/DIS/DIS Core Course/./dat...  XC624508   

                               species  
0  Setophaga magnolia_Magnolia Warbler  
1  Setophaga magnolia_Magnolia Warbler  
2  Setophaga magnolia_Magnolia Warbler  
3  Setophaga magnolia_Magnolia Warbler  
4  Setophaga magnolia_Magnolia Warbler  


In [15]:
# Show the untruncated path of the first row to confirm it points into the shortened dataset folder.
df_files_wav.iloc[0]['fullfilename']

'/Users/solomongreene/DIS/DIS Core Course/./data/bird_dataset_shortened/Setophaga magnolia_Magnolia Warbler/XC721665.wav'

In [16]:
# Species folders are named "<scientific name>_<common name>". Split once on the
# first underscore so a common name that itself contains "_" is kept whole; the
# vectorised .str accessor yields NaN (rather than raising IndexError) for a
# folder name that has no underscore.
df_files_wav['commonname'] = df_files_wav['species'].str.split('_', n=1).str[1]

In [18]:
# Display the frame (pandas elides the middle rows) to check the 'commonname' column.
df_files_wav

Unnamed: 0,fullfilename,filename,species,commonname
0,/Users/solomongreene/DIS/DIS Core Course/./dat...,XC721665,Setophaga magnolia_Magnolia Warbler,Magnolia Warbler
1,/Users/solomongreene/DIS/DIS Core Course/./dat...,XC691616,Setophaga magnolia_Magnolia Warbler,Magnolia Warbler
2,/Users/solomongreene/DIS/DIS Core Course/./dat...,XC140016,Setophaga magnolia_Magnolia Warbler,Magnolia Warbler
3,/Users/solomongreene/DIS/DIS Core Course/./dat...,XC133365,Setophaga magnolia_Magnolia Warbler,Magnolia Warbler
4,/Users/solomongreene/DIS/DIS Core Course/./dat...,XC624508,Setophaga magnolia_Magnolia Warbler,Magnolia Warbler
...,...,...,...,...
995,/Users/solomongreene/DIS/DIS Core Course/./dat...,XC447423,Geothlypis trichas_Common Yellowthroat,Common Yellowthroat
996,/Users/solomongreene/DIS/DIS Core Course/./dat...,XC556704,Geothlypis trichas_Common Yellowthroat,Common Yellowthroat
997,/Users/solomongreene/DIS/DIS Core Course/./dat...,XC909299,Geothlypis trichas_Common Yellowthroat,Common Yellowthroat
998,/Users/solomongreene/DIS/DIS Core Course/./dat...,XC828478,Geothlypis trichas_Common Yellowthroat,Common Yellowthroat
