In [1]:
import os
import pandas as pd
import numpy as np
import torchaudio
import torch
from transformers import AutoProcessor, EncodecModel
import requests
from tqdm import tqdm

In [2]:
# Use the GPU when torch reports that CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Function to download an audio file from a URL
def download_audio(url, save_path, timeout=30):
    """Download an audio file and write it to ``save_path``.

    Args:
        url: Location of the file. Unless it already starts with ``http://``
            or ``https://``, ``"https:"`` is prepended (the dataset links
            are stored without a scheme).
        save_path: Destination path on disk; an existing file is overwritten.
        timeout: Value forwarded to ``requests.get`` as ``timeout`` (seconds),
            so a stalled server cannot hang the notebook forever.

    Returns:
        ``save_path``, unchanged.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: For connection problems and timeouts.
    """
    if not url.startswith(("http://", "https://")):
        url = "https:" + url

    # The context manager releases the streamed connection even on failure.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail before opening the file so an error page is never saved as audio.
        response.raise_for_status()
        with open(save_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
    return save_path

# Function to process and encode a single WAV file
def encode_wav_file(file_path, bandwidth=24.0, target_sample_rate=24000):
    """Encode one audio file into EnCodec audio codes.

    Relies on the notebook-level ``processor``, ``model`` and ``device``
    objects being defined before the function is called.

    Args:
        file_path: Path of an audio file readable by ``torchaudio.load``.
        bandwidth: Value passed to ``model.encode`` as the target bandwidth.
            Defaults to the previously hard-coded 24.0.
        target_sample_rate: Sample rate the audio is resampled to and that is
            reported to the processor. Defaults to the previously hard-coded
            24000.

    Returns:
        NumPy array holding the audio codes with size-1 leading axes removed.
    """
    waveform, sample_rate = torchaudio.load(file_path)

    # Keep only the first channel, as before, but pick it *before* resampling
    # so the other channels are not resampled just to be thrown away.
    # (assumes torchaudio.load returns a channels-first tensor - its default)
    mono = waveform[0]

    if sample_rate != target_sample_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=target_sample_rate
        )
        mono = resampler(mono)

    inputs = processor(
        mono.numpy(), sampling_rate=target_sample_rate, return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        encoded = model.encode(inputs["input_values"], inputs["padding_mask"], bandwidth)

    codes = encoded[0]
    # Drop only the two leading axes when they have size 1 (presumably batch
    # and chunk - TODO confirm against the Transformers docs). A blanket
    # .squeeze() would also collapse the trailing axes for degenerate inputs
    # and make the output rank data-dependent.
    return codes.squeeze(1).squeeze(0).cpu().numpy()

In [22]:
import os
import pandas as pd
import numpy as np
import torch
import torchaudio
import requests
from tqdm import tqdm
from transformers import AutoProcessor, EncodecModel
!pip install sounddevice
import sounddevice as sd

# Use the GPU when torch reports that CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Function to download an audio file from a URL
def download_audio(url, save_path, timeout=30):
    """Download an audio file and write it to ``save_path``.

    Args:
        url: Location of the file. Unless it already starts with ``http://``
            or ``https://``, ``"https:"`` is prepended (the dataset links
            are stored without a scheme).
        save_path: Destination path on disk; an existing file is overwritten.
        timeout: Value forwarded to ``requests.get`` as ``timeout`` (seconds),
            so a stalled server cannot hang the notebook forever.

    Returns:
        ``save_path``, unchanged.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: For connection problems and timeouts.
    """
    if not url.startswith(("http://", "https://")):
        url = "https:" + url

    # The context manager releases the streamed connection even on failure.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail before opening the file so an error page is never saved as audio.
        response.raise_for_status()
        with open(save_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
    return save_path

# Function to process and encode a single WAV file
def encode_wav_file(file_path, bandwidth=1.5, target_sample_rate=24000):
    """Encode one audio file into EnCodec audio codes.

    Relies on the notebook-level ``processor``, ``model`` and ``device``
    objects being defined before the function is called.

    Args:
        file_path: Path of an audio file readable by ``torchaudio.load``.
        bandwidth: Value passed to ``model.encode`` as the target bandwidth.
            Defaults to the previously hard-coded 1.5.
        target_sample_rate: Sample rate the audio is resampled to and that is
            reported to the processor. Defaults to the previously hard-coded
            24000.

    Returns:
        NumPy array holding the audio codes with size-1 leading axes removed.
    """
    waveform, sample_rate = torchaudio.load(file_path)

    # Keep only the first channel, as before, but pick it *before* resampling
    # so the other channels are not resampled just to be thrown away.
    # (assumes torchaudio.load returns a channels-first tensor - its default)
    mono = waveform[0]

    if sample_rate != target_sample_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=target_sample_rate
        )
        mono = resampler(mono)

    inputs = processor(
        mono.numpy(), sampling_rate=target_sample_rate, return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        encoded = model.encode(inputs["input_values"], inputs["padding_mask"], bandwidth)

    codes = encoded[0]
    print('audio codes', codes.shape)

    # Drop only the two leading axes when they have size 1 (presumably batch
    # and chunk - TODO confirm against the Transformers docs). A blanket
    # .squeeze() would also collapse the trailing axes for degenerate inputs,
    # which would break the two unsqueeze(0) calls in decode_and_play_audio.
    return codes.squeeze(1).squeeze(0).cpu().numpy()

# Function to decode audio codes and play the audio
def decode_and_play_audio(audio_codes):
    """Decode EnCodec codes back to a waveform and play it with sounddevice.

    Relies on the notebook-level ``model`` and ``device`` objects.

    Args:
        audio_codes: NumPy array of codes as returned by ``encode_wav_file``.

    Returns:
        None. ``sd.wait()`` is called right after ``sd.play()``, so the call
        presumably returns only once playback has finished.
    """
    # Prepend two singleton axes so the codes regain the 4-D layout printed
    # by the encoder, then move them next to the model.
    codes = torch.from_numpy(audio_codes)[None, None].to(device)
    print('shape', codes.shape)

    # Inference only: no gradients are needed for decoding.
    with torch.no_grad():
        decoded = model.decode(codes, [None])

    samples = decoded[0].cpu().numpy().squeeze()
    print('decoded', samples.shape)

    # Playback uses the same 24 kHz rate the audio was encoded at.
    sd.play(samples, samplerate=24000)
    sd.wait()

# Load the Encodec model and processor
processor = AutoProcessor.from_pretrained("facebook/encodec_24khz")
model = EncodecModel.from_pretrained("facebook/encodec_24khz").to(device)

# Read the CSV files
amazonas_df = pd.read_csv('./dataset/amazonas.csv')
manaus_df = pd.read_csv('./dataset/greater_manaus.csv')

# Combine the dataframes
combined_df = pd.concat([amazonas_df, manaus_df], ignore_index=True)

# Prepare the final dataset
final_dataset = []

# Process each row in the combined dataframe
for index, row in tqdm(combined_df.iterrows(), total=combined_df.shape[0]):
    common_name = row['Common Name']
    
    mp3_link = row['MP3 Link']
    
    if pd.notna(mp3_link):
        # Download the audio file
        file_path = download_audio(mp3_link, f"temp_audio_{index}.mp3")
        
        try:
            # Encode the audio file
            audio_codes = encode_wav_file(file_path)
            decode_and_play_audio(audio_codes)
            final_dataset.append((common_name, audio_codes))
        except Exception as e:
            print(f"Error processing file {file_path}: {e}")
        finally:
            # Remove the temporary audio file
            os.remove(file_path)

# Create a structured numpy array
dtype = [('Common Name', 'U50'), ('Audio Codes', 'O')]
structured_array = np.array(final_dataset, dtype=dtype)

# Save the structured array to an npy file
np.save('dataset/bird_dataset.npy', structured_array)



  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)
  0%|                                                                                         | 0/5339 [00:00<?, ?it/s]

audio codes torch.Size([1, 1, 2, 1541])
shape torch.Size([1, 1, 2, 1541])
decoded (493120,)


  0%|                                                                              | 1/5339 [00:23<34:27:13, 23.24s/it]

audio codes torch.Size([1, 1, 2, 3731])
shape torch.Size([1, 1, 2, 3731])
decoded (1193920,)


  0%|                                                                             | 1/5339 [01:14<109:46:00, 74.03s/it]


KeyboardInterrupt: 

In [13]:
 import pandas as pd

# Read the full table of labelled clips.
final_df = pd.read_csv('final_dataset.csv')

# Count clips per common name; value_counts() orders the names from most to
# least frequent, so the first ten index entries are the ten largest classes.
common_name_frequency = final_df['Common Name'].value_counts()
top_10_common_names = list(common_name_frequency.index[:10])

# Hold out the first clip of each of those ten names as the test set.
# groupby(...).head(1) keeps the original row labels, which lets the same
# rows be dropped from the training data below.
is_top_name = final_df['Common Name'].isin(top_10_common_names)
top_name_rows = final_df.loc[is_top_name]
test_df = top_name_rows.groupby('Common Name').head(1)

# Everything that was not held out stays in the training set.
final_df_filtered = final_df.drop(index=test_df.index)

# Write both splits without the index column.
final_df_filtered.to_csv('dataset/train.csv', index=False)
test_df.to_csv('dataset/test.csv', index=False)

In [1]:
import os
import pandas as pd
import requests
from tqdm import tqdm

# Create the data directory if it doesn't exist. The download loop below
# saves every MP3 under "data"; exist_ok=True keeps a re-run of this cell
# from failing when the directory is already there.
os.makedirs("data", exist_ok=True)

# Function to download an audio file from a URL
def download_audio(url, save_path, timeout=30):
    """Download an audio file and write it to ``save_path``.

    Args:
        url: Location of the file. Unless it already starts with ``http://``
            or ``https://``, ``"https:"`` is prepended (the dataset links
            are stored without a scheme).
        save_path: Destination path on disk; an existing file is overwritten.
        timeout: Value forwarded to ``requests.get`` as ``timeout`` (seconds),
            so a stalled server cannot hang the notebook forever.

    Returns:
        ``save_path``, unchanged.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: For connection problems and timeouts.
    """
    if not url.startswith(("http://", "https://")):
        url = "https:" + url

    # The context manager releases the streamed connection even on failure.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail before opening the file so an error page is never saved as audio.
        response.raise_for_status()
        with open(save_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
    return save_path

# Read the CSV files
amazonas_df = pd.read_csv('./dataset/amazonas.csv')
manaus_df = pd.read_csv('./dataset/greater_manaus.csv')

# Combine the dataframes
combined_df = pd.concat([amazonas_df, manaus_df], ignore_index=True)

# Prepare the final dataset
final_dataset = []

# Process each row in the combined dataframe
for index, row in tqdm(combined_df.iterrows(), total=combined_df.shape[0]):
    common_name = row['Common Name']
    mp3_link = row['MP3 Link']
    
    if pd.notna(mp3_link):
        # Define the save path
        save_path = os.path.join("data", f"audio_{index}.mp3")
        
        # Download the audio file
        file_path = download_audio(mp3_link, save_path)
        
        final_dataset.append({'Common Name': common_name, 'File Path': file_path})

# Convert the final dataset to a DataFrame
final_df = pd.DataFrame(final_dataset)

# Save the final dataset to a CSV file
final_df.to_csv('final_dataset_wav.csv', index=False)

# Load the final_dataset.csv
final_df = pd.read_csv('final_dataset_wav.csv')

# Calculate the frequency of each common name
common_name_frequency = final_df['Common Name'].value_counts()

# Get the top 10 most frequent common names
top_10_common_names = common_name_frequency.head(10).index.tolist()

# Create a test dataset by selecting one instance of each of the top 10 common names
test_df = final_df[final_df['Common Name'].isin(top_10_common_names)].groupby('Common Name').head(1)

# Remove these instances from the final dataset
final_df_filtered = final_df.drop(test_df.index)

# Save the updated final dataset and the test dataset to CSV files
final_df_filtered.to_csv('dataset/train_wav.csv', index=False)
test_df.to_csv('dataset/test_wav.csv', index=False)

100%|██████████████████████████████████████████████████████████████████████████████| 5339/5339 [47:00<00:00,  1.89it/s]
